diff --git a/.gitmodules b/.gitmodules index 470cf466..107505e0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "third_party/eigen"] path = third_party/eigen url = git@github.com:InfiniTensor/eigen-mirror.git +[submodule "third_party/flash_attention"] + path = third_party/flash_attention + url = https://github.com/Dao-AILab/flash-attention.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 74536707..a12258e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,15 +75,24 @@ if(USE_CUDA) add_compile_definitions(USE_CUDA=1) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) + + # ========== cuDNN 库 ========== + find_library(CUDNN_LIBRARY cudnn REQUIRED) + message(STATUS "Found cuDNN at: ${CUDNN_LIBRARY}") + # ======================================== + include_directories(${CUDAToolkit_INCLUDE_DIRS}) # CUDA compilation options set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr") - # Only compile CUDA kernels / cuda sources here (your original used src/*.cu) + # Only compile CUDA kernels / cuda sources here file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu) add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS}) + target_include_directories(infini_train_cuda_kernels PUBLIC + ${PROJECT_SOURCE_DIR}/third_party/cudnn-frontend/include + ) set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90") target_link_libraries(infini_train_cuda_kernels @@ -92,6 +101,7 @@ if(USE_CUDA) CUDA::cudart CUDA::cublas CUDA::cuda_driver + ${CUDNN_LIBRARY} ) if(USE_NCCL) @@ -116,8 +126,6 @@ target_link_libraries(infini_train ) if(USE_CUDA) - # infini_train contains cuda runtime wrappers (*.cc) like cuda_blas_handle.cc/cuda_guard.cc - # Those may need CUDA runtime/driver/cublas symbols at final link, so attach them here too. 
target_link_libraries(infini_train PUBLIC infini_train_cuda_kernels @@ -127,15 +135,12 @@ if(USE_CUDA) ) if(USE_NCCL) - # If your core library code also directly references NCCL symbols (not only kernels), - # keep this. Otherwise it's harmless. target_link_libraries(infini_train PUBLIC nccl) endif() endif() # ------------------------------------------------------------------------------ # Helper: link libraries in a group to fix static lib one-pass resolution -# (THIS is what fixes "undefined reference" from cuda_kernels -> core symbols) # ------------------------------------------------------------------------------ function(link_infini_train_exe target_name) if(USE_CUDA) @@ -160,7 +165,6 @@ function(link_infini_train_exe target_name) endif() endfunction() - # ------------------------------------------------------------------------------ # Examples # ------------------------------------------------------------------------------ @@ -199,4 +203,4 @@ add_executable(test_hook test/hook/test_hook.cc) target_link_libraries(test_hook infini_train) add_executable(test_precision_check test/hook/test_precision_check.cc) -target_link_libraries(test_precision_check infini_train) +target_link_libraries(test_precision_check infini_train) \ No newline at end of file diff --git "a/InfiniTrain\346\212\245\345\221\212.md" "b/InfiniTrain\346\212\245\345\221\212.md" new file mode 100644 index 00000000..17c815a8 --- /dev/null +++ "b/InfiniTrain\346\212\245\345\221\212.md" @@ -0,0 +1,88 @@ +# InfiniTrain 作业报告 + +## 1. 功能正确性验证 +gpt2_1_bfloat16 +![alt text](image-3.png) +gpt2_bfloat16_flash +![alt text](image-4.png) +llama3_1_bfloat16 +![alt text](image-2.png) +llama3_1_bfloat16_flash +![alt text](image-5.png) + + +## 2. 
性能评估报告 +### 2.1 实验环境说明 + +**硬件环境** +- GPU 型号:NVIDIA A100-SXM4-80GB +- 单卡显存:81920 MiB(80GB) +- 机器总卡数:8 张(index 0~7) +- 本次测试可见设备:`CUDA_VISIBLE_DEVICES=4,5,6,7` +- 实际并行配置:日志中 `DP=1, TP=1, SP=1, PP=1`,即单进程单卡执行 + +**软件环境** +- CUDA:12.8(`nvcc` build `cuda_12.8.r12.8`) +- Driver:570.133.20 +- C++ 编译器:`c++ (Ubuntu 13.3.0) 13.3.0` +- CMake:3.31.4 +- 编译命令:`cmake -DUSE_CUDA=ON -DUSE_NCCL=ON .. && make -j` + +### 2.2 实验配置 + +基于四个日志文件: +- `gpt2_1_bfloat16.log`(baseline) +- `gpt2_1_bfloat16_fla.log`(FlashAttention) +- `llama3_1_bfloat16.log`(baseline) +- `llama3_1_bfloat16_fla.log`(FlashAttention) + +关键参数(由程序默认参数与命令行确认): +- `dtype=bfloat16` +- `batch_size=4` +- `sequence_length=64` +- `total_batch_size=256 tokens/step` +- 训练步数:10 steps +- baseline:小算子拼接版本(不加 `--flash true`) +- 实验组:FlashAttention 融合算子版本(`--flash true`) + +> 说明:为减少首步冷启动影响,下面主表采用 **step 2~10** 的均值作为稳态指标。 + +### 2.3 性能指标定义 + +- 平均时延(avg latency):每步迭代耗时均值(ms) +- 吞吐率(tokens/s):日志中的每步 tokens/s 均值 +- GPU 显存占用(MB):日志 `peak used` 的峰值(max) +- 加速比:$\text{Speedup} = \frac{\text{Latency}_{baseline}}{\text{Latency}_{flash}}$ +- 显存节省比例:$\text{MemSaving} = \frac{\text{Mem}_{baseline}-\text{Mem}_{flash}}{\text{Mem}_{baseline}} \times 100\%$ + +### 2.4 结果展示(baseline vs FlashAttention) + +| 模型 | 方案 | Avg Latency (ms) | Throughput (tok/s) | Peak Used (MB) | +|---|---|---:|---:|---:| +| GPT2 | baseline | 119.71 | 2153.67 | 1914 | +| GPT2 | FlashAttention | 63.58 | 4057.67 | 3056 | +| LLaMA3 | baseline | 768.33 | 333.78 | 24561 | +| LLaMA3 | FlashAttention | 336.90 | 765.33 | 26552 | + +**汇总指标(按模型聚合)** + +| 模型 | Speedup (baseline/flash) | 吞吐提升 (flash/baseline) | 显存节省比例 | +|---|---:|---:|---:| +| GPT2 | 1.88x | 1.88x | -59.67% | +| LLaMA3 | 2.28x | 2.29x | -8.11% | + +### 2.5 结论分析 + +1. **GPT2 上 FlashAttention 提升明显**: + - 时延从 119.71 ms 降到 63.58 ms,Speedup 为 **1.88x**; + - 吞吐从 2153.67 提升到 4057.67 tok/s(约 **1.88x**)。 + +2. 
**LLaMA3 上收益显著**: + - 时延从 768.33 ms 降到 336.90 ms,Speedup 为 **2.28x**; + - 吞吐从 333.78 提升到 765.33 tok/s(约 **2.29x**)。 + +3. **显存占用现象**: + - GPT2 在本次日志中 FlashAttention 的 `peak used` 更高(1914 MB -> 3056 MB,显存节省比例 -59.67%); + - LLaMA3 在本次日志中 FlashAttention 的 `peak used` 也更高(24561 MB -> 26552 MB,显存节省比例 -8.11%); + - 说明本次实验里 FlashAttention 的收益主要体现在计算效率(时延/吞吐),而非显存降低。 + diff --git a/example/gpt2/main.cc b/example/gpt2/main.cc index a007dff1..156fdcab 100644 --- a/example/gpt2/main.cc +++ b/example/gpt2/main.cc @@ -78,6 +78,7 @@ DEFINE_string(dtype, "float32", "precision used in training (float32/bfloat16)") DEFINE_string( precision_check, "", "precision check config: level=N,format=simple|table,output_md5=true|false,output_path=PATH,baseline=PATH"); +DEFINE_bool(flash, false, "Whether to enable flash attention"); using namespace infini_train; @@ -140,6 +141,7 @@ void Train(const nn::parallel::Rank &rank) { if (rank.IsParallel()) { device = Device(Device::DeviceType::kCUDA, rank.thread_rank()); + // auto *pg_factory = ProcessGroupFactory::Instance(device.type()); if (ddp_world_size > 1) { @@ -322,6 +324,10 @@ void Train(const nn::parallel::Rank &rank) { } for (int micro_step = 0; micro_step < grad_accum_steps; ++micro_step) { + if (auto dist_optimizer = std::dynamic_pointer_cast(optimizer)) { + dist_optimizer->SetIsLastMicrobatch(micro_step == grad_accum_steps - 1); + } + // enable autocast for the current step infini_train::AutocastGuard autocast_guard(device.type(), dtype); diff --git a/example/gpt2/net.cc b/example/gpt2/net.cc index 8d497797..6a1e414f 100644 --- a/example/gpt2/net.cc +++ b/example/gpt2/net.cc @@ -12,6 +12,7 @@ #include #include "glog/logging.h" +#include "gflags/gflags.h" #include "example/common/utils.h" #include "infini_train/include/device.h" @@ -29,6 +30,7 @@ #include "infini_train/include/nn/parallel/utils.h" #include "infini_train/include/tensor.h" + using namespace infini_train; namespace nn = infini_train::nn; @@ -78,6 +80,7 @@ 
CausalSelfAttention::CausalSelfAttention(const GPT2Config &config) ->View({1, 1, config_.block_size, config_.block_size}); } +DECLARE_bool(flash); std::vector> CausalSelfAttention::Forward(const std::vector> &x) { auto tp_world_size = nn::parallel::global::GetTensorParallelSize(); @@ -96,7 +99,7 @@ CausalSelfAttention::Forward(const std::vectorseq_len const auto T = q->Dims()[1]; // View to multi-head: local_n_head * head_dim == local_C @@ -105,18 +108,39 @@ CausalSelfAttention::Forward(const std::vectorView({B, T, local_n_head_, head_dim})->Transpose(1, 2); v = v->View({B, T, local_n_head_, head_dim})->Transpose(1, 2); - // (B, h_l, T, T) - auto att = q->Matmul(k->Transpose(-2, -1)) * (1.0 / std::sqrt(head_dim)); - // (1, 1, T, T) - auto mask = buffers_[kParamBiasName]->Slice({0, 0, 0, 0}, {1, 1, T, T}, {1, 1, 1, 1}); - // (1, 1, T, T) -> eq 0 -> (1, 1, T, T) -> masked_fill -> (B, h_l, T, T) - att = att->MaskedFill(mask == 0, -std::numeric_limits::infinity()); - // (B, h_l, T, T) - att = nn::function::Softmax(att, -1); - // (B, h_l, T, Dh) - auto y = att->Matmul(v); - // (B, h_l, T, Dh) -> (B, T, h_l, Dh) -> (B, T, local_C) - y = y->Transpose(1, 2)->Contiguous()->View({B, T, local_C}); + std::shared_ptr y; + if (FLAGS_flash) { + // cuDNN SDPA path: causal masking should be enabled by `is_causal=true`. + // Do not pass the 0/1 tril mask as additive bias (it is not -inf mask). 
+ auto q_flash = q; + auto k_flash = k; + auto v_flash = v; + if (q->Dtype() == DataType::kFLOAT32) { + q_flash = std::make_shared(q->To(DataType::kBFLOAT16)); + k_flash = std::make_shared(k->To(DataType::kBFLOAT16)); + v_flash = std::make_shared(v->To(DataType::kBFLOAT16)); + } + y = nn::function::ScaledDotProductAttention(q_flash, k_flash, v_flash, nullptr, 0.0, true, std::nullopt, + false); + if (y->Dtype() != q->Dtype()) { + y = std::make_shared(y->To(q->Dtype())); + } + // ensure expected layout: (B, h_l, T, Dh) -> (B, T, h_l, Dh) -> (B, T, local_C) + y = y->Transpose(1, 2)->Contiguous()->View({B, T, local_C}); + } else { + // (B, h_l, T, T) + auto att = q->Matmul(k->Transpose(-2, -1)) * (1.0 / std::sqrt(head_dim)); + // (1, 1, T, T) + auto mask = buffers_[kParamBiasName]->Slice({0, 0, 0, 0}, {1, 1, T, T}, {1, 1, 1, 1}); + // (1, 1, T, T) -> eq 0 -> (1, 1, T, T) -> masked_fill -> (B, h_l, T, T) + att = att->MaskedFill(mask == 0, -std::numeric_limits::infinity()); + // (B, h_l, T, T) + att = nn::function::Softmax(att, -1); + // (B, h_l, T, Dh) + y = att->Matmul(v); + // (B, h_l, T, Dh) -> (B, T, h_l, Dh) -> (B, T, local_C) + y = y->Transpose(1, 2)->Contiguous()->View({B, T, local_C}); + } // Get full tensor // (B, T, local_C) -> RowParallelLinear(n_embd, n_embd) -> (B, T, C) diff --git a/example/llama3/main.cc b/example/llama3/main.cc index 2b1e2121..eaf96b8d 100644 --- a/example/llama3/main.cc +++ b/example/llama3/main.cc @@ -91,6 +91,7 @@ constexpr char kDtypeBF16[] = "bfloat16"; DEFINE_validator(model, [](const char *, const std::string &value) { return kSupportedModels.contains(value); }); DEFINE_validator(device, [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; }); +DEFINE_bool(flash, false, "Whether to enable flash attention"); void Train(const nn::parallel::Rank &rank) { using namespace nn::parallel; @@ -298,6 +299,10 @@ void Train(const nn::parallel::Rank &rank) { } for (int micro_step = 0; micro_step < 
grad_accum_steps; ++micro_step) { + if (auto dist_optimizer = std::dynamic_pointer_cast(optimizer)) { + dist_optimizer->SetIsLastMicrobatch(micro_step == grad_accum_steps - 1); + } + // enable autocast for the current step infini_train::AutocastGuard autocast_guard(device.type(), dtype); diff --git a/example/llama3/net.cc b/example/llama3/net.cc index a50fb831..5285f53f 100644 --- a/example/llama3/net.cc +++ b/example/llama3/net.cc @@ -11,6 +11,7 @@ #include #include +#include "gflags/gflags.h" #include "glog/logging.h" #include "example/common/utils.h" @@ -138,6 +139,7 @@ std::vector> RMSNorm::Forward(const std::vector> CausalSelfAttention::Forward(const std::vec k = k->Transpose(1, 2); v = v->Transpose(1, 2); - // TODO(zbl): support flash attention later - // if (flash_) { ... } - - // manual implementation of attention - // this materializes the large (T,T) matrix for all the queries and keys - - // q: (B, H_local, T, D) - // k: (B, H_local, T, D) -> (B, H_local, D, T) - // q @ k.T: (B, H_local, T, T) -> mul 1.0 / sqrt(D) -> (B, H_local, T, T) - auto att = q->Matmul(k->Transpose(-2, -1)) * (1.0 / std::sqrt(static_cast(D))); - if (mask) { - // mask: (1, 1, T, T) - att = att->MaskedFill(mask, std::numeric_limits::lowest()); + std::shared_ptr y; + if (FLAGS_flash) { + // cuDNN SDPA path: causal masking should be enabled by `is_causal=true`. + // Do not pass Triu(ones, 1) mask as additive bias. 
+ auto q_flash = q; + auto k_flash = k; + auto v_flash = v; + if (q->Dtype() == DataType::kFLOAT32) { + q_flash = std::make_shared(q->To(DataType::kBFLOAT16)); + k_flash = std::make_shared(k->To(DataType::kBFLOAT16)); + v_flash = std::make_shared(v->To(DataType::kBFLOAT16)); + } + y = nn::function::ScaledDotProductAttention(q_flash, k_flash, v_flash, nullptr, 0.0, true, std::nullopt, + false); + if (y->Dtype() != q->Dtype()) { + y = std::make_shared(y->To(q->Dtype())); + } + // ensure expected layout: (B, H_local, T, D) -> (B, T, H_local, D) -> (B, T, C_local) + y = y->Transpose(1, 2)->Contiguous()->View({B, T, C_local}); + } else { + // manual implementation of attention + // this materializes the large (T,T) matrix for all the queries and keys + + // q: (B, H_local, T, D) + // k: (B, H_local, T, D) -> (B, H_local, D, T) + // q @ k.T: (B, H_local, T, T) -> mul 1.0 / sqrt(D) -> (B, H_local, T, T) + auto att = q->Matmul(k->Transpose(-2, -1)) * (1.0 / std::sqrt(static_cast(D))); + if (mask) { + // mask: (1, 1, T, T) + att = att->MaskedFill(mask, std::numeric_limits::lowest()); + } + // (B, H_local, T, T) + att = nn::function::Softmax(att, -1); + // att: (B, H_local, T, T) @ v: (B, H_local, T, D) -> y: (B, H_local, T, D) + y = att->Matmul(v); + // (B, H_local, T, D) -> Transpose(1, 2) -> (B, T, H_local, D) -> (B, T, C_local) + y = y->Transpose(1, 2)->Contiguous()->View({B, T, C_local}); } - // (B, H_local, T, T) - att = nn::function::Softmax(att, -1); - // att: (B, H_local, T, T) @ v: (B, H_local, T, D) -> y: (B, H_local, T, D) - auto y = att->Matmul(v); - // (B, H_local, T, D) -> Transpose(1, 2) -> (B, T, H_local, D) -> (B, T, C_local) - y = y->Transpose(1, 2)->Contiguous()->View({B, T, C_local}); // output projection // (B, T, C_local) -> RowParallelLinear(C, C) -> (B, T, C) y = (*modules_[kCProjLayerName])({y})[0]; diff --git a/example/mnist/main.cc b/example/mnist/main.cc index e62257d7..4cd7b8f6 100644 --- a/example/mnist/main.cc +++ b/example/mnist/main.cc @@ 
-35,6 +35,7 @@ constexpr char kDeviceCUDA[] = "cuda"; DEFINE_validator(device, [](const char *, const std::string &value) { return value == kDeviceCPU || value == kDeviceCUDA; }); +DEFINE_bool(flash, false, "Whether to enable flash attention"); int main(int argc, char *argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); diff --git a/image-1.png b/image-1.png new file mode 100644 index 00000000..8cddbc4f Binary files /dev/null and b/image-1.png differ diff --git a/image-2.png b/image-2.png new file mode 100644 index 00000000..41e7a99c Binary files /dev/null and b/image-2.png differ diff --git a/image-3.png b/image-3.png new file mode 100644 index 00000000..58b53abf Binary files /dev/null and b/image-3.png differ diff --git a/image-4.png b/image-4.png new file mode 100644 index 00000000..220f11f3 Binary files /dev/null and b/image-4.png differ diff --git a/image-5.png b/image-5.png new file mode 100644 index 00000000..77f6eb92 Binary files /dev/null and b/image-5.png differ diff --git a/image.png b/image.png new file mode 100644 index 00000000..cf3fa348 Binary files /dev/null and b/image.png differ diff --git a/infini_train/include/autocast.h b/infini_train/include/autocast.h index 499c586f..a10a084c 100644 --- a/infini_train/include/autocast.h +++ b/infini_train/include/autocast.h @@ -48,7 +48,7 @@ enum class CastPolicy : uint8_t { }; // Cast-policy maps and their associated operations. The op names should match the ones defined in the op registry. -inline constexpr std::array kLowerPrecisionOps = {"Matmul", "Linear"}; +inline constexpr std::array kLowerPrecisionOps = {"Matmul"}; inline constexpr std::array kFP32Ops = {"Sin", "Cos", "Tan", "Asin", "Acos", "Atan", "Sinh", "Cosh", "Tanh", "Asinh", "Acosh", "Atanh", "Exp", "Log", @@ -59,7 +59,7 @@ inline constexpr std::array kFP32Ops // op names should match the ones defined in the op registry. 
inline const std::unordered_map kOpCastPolicyMap = { {"Matmul", CastPolicy::kLowerPrecision}, - {"Linear", CastPolicy::kLowerPrecision}, + {"Linear", CastPolicy::kFP32}, {"Sin", CastPolicy::kFP32}, {"Cos", CastPolicy::kFP32}, {"Tan", CastPolicy::kFP32}, diff --git a/infini_train/include/autograd/scaled_dot_product_attention.h b/infini_train/include/autograd/scaled_dot_product_attention.h new file mode 100644 index 00000000..e48f900a --- /dev/null +++ b/infini_train/include/autograd/scaled_dot_product_attention.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include + +#include "infini_train/include/autograd/function.h" + +namespace infini_train { +class Tensor; +} + +namespace infini_train::autograd { + +class ScaledDotProductAttention : public Function { +public: + static constexpr char kType[] = "ScaledDotProductAttention"; + + ScaledDotProductAttention(double dropout_p, bool is_causal, + std::optional scale, bool enable_gqa) + : Function(kType), dropout_p_(dropout_p), is_causal_(is_causal), scale_(scale), + enable_gqa_(enable_gqa) {} + + std::vector> Forward( + const std::vector> &input_tensors) override; + + void SetupContext(const std::vector> &input_tensors, + const std::vector> &output_tensors) override; + + std::vector> Backward( + const std::vector> &grad_outputs) override; + +private: + double dropout_p_ = 0.0; + + bool is_causal_ = false; + std::optional scale_ = std::nullopt; + bool enable_gqa_ = false; + bool has_attn_mask_input_ = false; + std::shared_ptr forward_out_ = nullptr; + std::shared_ptr forward_lse_ = nullptr; + // Saved tensors for backward can be managed via Function's SaveForBackward helper +}; +} // namespace infini_train::autograd diff --git a/infini_train/include/common/common.h b/infini_train/include/common/common.h index b6a02543..ea04c790 100644 --- a/infini_train/include/common/common.h +++ b/infini_train/include/common/common.h @@ -13,6 +13,8 @@ LOG_LOC(FATAL, WRAP(CONTEXT_IDENTIFIER << ": Unsupported data type: " \ + 
kDataTypeToDesc.at(static_cast(dtype)))) + +//compute strides for a given shape. inline std::vector ComputeStrides(const std::vector &dims) { std::vector strides(dims.size(), 1); for (int i = dims.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * dims[i + 1]; } diff --git a/infini_train/include/nn/functional.h b/infini_train/include/nn/functional.h index e4354fd1..ba92b981 100644 --- a/infini_train/include/nn/functional.h +++ b/infini_train/include/nn/functional.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace infini_train { @@ -162,6 +163,19 @@ std::shared_ptr Softmax(const std::shared_ptr &input, int64_t di std::shared_ptr Slice(const std::shared_ptr &input, const std::vector &starts, const std::vector &ends, const std::vector &steps); +// Scaled dot-product attention interface matching PyTorch's scaled_dot_product_attention. +// - query, key, value: tensors with shape (..., seq_len, head_dim) +// - attn_mask: optional additive mask (same broadcasting semantics as PyTorch) +// - dropout_p: dropout probability (0.0 disables) +// - is_causal: whether to apply causal mask +// - scale: optional scale factor; if not provided, use 1/sqrt(head_dim) +// - enable_gqa: grouped query attention flag +std::shared_ptr ScaledDotProductAttention( + const std::shared_ptr &query, const std::shared_ptr &key, + const std::shared_ptr &value, const std::shared_ptr &attn_mask = nullptr, + double dropout_p = 0.0, bool is_causal = false, + const std::optional &scale = std::nullopt, bool enable_gqa = false); + // Concatenates a sequence of tensors along a new dimension. 
// // Args: diff --git a/infini_train/include/nn/parallel/ddp/distributed_optimizer.h b/infini_train/include/nn/parallel/ddp/distributed_optimizer.h index bc31442e..81f640fc 100644 --- a/infini_train/include/nn/parallel/ddp/distributed_optimizer.h +++ b/infini_train/include/nn/parallel/ddp/distributed_optimizer.h @@ -31,6 +31,9 @@ class DistributedOptimizer final : public infini_train::Optimizer { void StartGradSync(); void FinishGradSync(); + // Forward microbatch boundary info to bucket groups. + void SetIsLastMicrobatch(bool is_last_microbatch); + void StartParamSync(bool force_sync = false); void FinishParamSync(bool skip_next_bucket_dispatch = false); diff --git a/infini_train/include/nn/parallel/ddp/param_and_grad_buffer.h b/infini_train/include/nn/parallel/ddp/param_and_grad_buffer.h index c83fe9a5..b4a2aa9d 100644 --- a/infini_train/include/nn/parallel/ddp/param_and_grad_buffer.h +++ b/infini_train/include/nn/parallel/ddp/param_and_grad_buffer.h @@ -70,6 +70,9 @@ class ParamAndGradBucketGroup { // When all params in a bucket group are ready, will call StartGradSync() void RegisterGradReady(const std::shared_ptr ¶meter); + // Mark whether current backward corresponds to the last microbatch in a gradient accumulation window. 
+ void SetIsLastMicrobatch(bool is_last_microbatch); + // Start grad reduce void StartGradSync(); diff --git a/infini_train/src/autograd/function.cc b/infini_train/src/autograd/function.cc index 42a95729..ff2cad58 100644 --- a/infini_train/src/autograd/function.cc +++ b/infini_train/src/autograd/function.cc @@ -18,6 +18,7 @@ namespace infini_train::autograd { std::vector> Function::Apply(const std::vector> &input_tensors) { CHECK_GE(input_tensors.size(), 1); auto device = input_tensors[0]->GetDevice(); + // *:Switch to the device where the input tensor is located core::DeviceGuard guard(device); // Register precision check hooks if enabled (before forward) @@ -29,13 +30,15 @@ std::vector> Function::Apply(const std::vector> output_tensors; { autograd::NoGradGuard no_grad; @@ -78,6 +81,7 @@ std::vector> Function::Apply(const std::vectorset_requires_grad(output_requires_grad); output_tensor->set_grad_fn(output_requires_grad ? shared_from_this() : nullptr); + //条件二含义:需要梯度,但是它没有生父算子(即它是用户手动创建的原始参数,不是算出来的)。 output_tensor->set_is_leaf(!output_requires_grad || ((output_tensor->grad_fn() == nullptr) && output_requires_grad)); output_tensor->set_output_idx(output_idx); diff --git a/infini_train/src/autograd/scaled_dot_product_attention.cc b/infini_train/src/autograd/scaled_dot_product_attention.cc new file mode 100644 index 00000000..e15b1579 --- /dev/null +++ b/infini_train/src/autograd/scaled_dot_product_attention.cc @@ -0,0 +1,75 @@ +#include "infini_train/include/autograd/scaled_dot_product_attention.h" + +#include "glog/logging.h" + +#include "infini_train/include/dispatcher.h" +#include "infini_train/include/tensor.h" + +namespace infini_train::autograd { + +std::vector> ScaledDotProductAttention::Forward( + const std::vector> &input_tensors) { + CHECK(input_tensors.size() == 3 || input_tensors.size() == 4); + const auto &q = input_tensors[0]; + const auto &k = input_tensors[1]; + const auto &v = input_tensors[2]; + const auto mask = input_tensors.size() == 4 ? 
input_tensors[3] : nullptr; + + auto device = q->GetDevice().type(); + // Call device kernel. Kernel name: ScaledDotProductAttentionForward + auto out_and_lse = Dispatcher::Instance().Call, std::shared_ptr>>( + {device, "ScaledDotProductAttentionForward"}, q, k, v, mask, dropout_p_, is_causal_, scale_, + enable_gqa_); + forward_out_ = std::get<0>(out_and_lse); + forward_lse_ = std::get<1>(out_and_lse); + auto out = forward_out_; + return {out}; +} + +void ScaledDotProductAttention::SetupContext(const std::vector> &input_tensors, + const std::vector> &output_tensors) { + (void)output_tensors; + // Save q,k,v and mask (mask may be nullptr) + const auto &q = input_tensors[0]; + const auto &k = input_tensors[1]; + const auto &v = input_tensors[2]; + std::shared_ptr mask = nullptr; + has_attn_mask_input_ = (input_tensors.size() == 4); + if (input_tensors.size() == 4) { + mask = input_tensors[3]; + } + saved_tensors_ = {q, k, v, mask}; +} + +std::vector> ScaledDotProductAttention::Backward( + const std::vector> &grad_outputs) { + CHECK(saved_tensors_.size() == 4); + const auto &q = saved_tensors_[0]; + const auto &k = saved_tensors_[1]; + const auto &v = saved_tensors_[2]; + const auto &mask = saved_tensors_[3]; + + CHECK_EQ(grad_outputs.size(), 1); + const auto &grad_output = grad_outputs[0]; + + auto device = grad_output->GetDevice().type(); + + CHECK(forward_out_ != nullptr); + CHECK(forward_lse_ != nullptr); + + auto grads = Dispatcher::Instance().Call, std::shared_ptr, + std::shared_ptr>>( + {device, "ScaledDotProductAttentionBackward"}, grad_output, q, k, v, mask, forward_out_, forward_lse_, + dropout_p_, is_causal_, scale_, enable_gqa_); + + forward_out_ = nullptr; + forward_lse_ = nullptr; + + if (has_attn_mask_input_) { + return {std::get<0>(grads), std::get<1>(grads), std::get<2>(grads), nullptr}; + } + + return {std::get<0>(grads), std::get<1>(grads), std::get<2>(grads)}; +} + +} // namespace infini_train::autograd diff --git 
a/infini_train/src/kernels/cuda/flash_attention.cu b/infini_train/src/kernels/cuda/flash_attention.cu new file mode 100644 index 00000000..d12fa6a3 --- /dev/null +++ b/infini_train/src/kernels/cuda/flash_attention.cu @@ -0,0 +1,564 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "infini_train/include/common/cuda/common_cuda.h" +#include "infini_train/include/common/cuda/kernel_helper.cuh" +#include "infini_train/include/core/runtime/device_guard.h" +#include "infini_train/include/dispatcher.h" +#include "infini_train/include/tensor.h" + +#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h" +//#include "infini_train/src/core/cuda/cuda_stream.h" +#include "infini_train/include/common/common.h" // ComputeStrides +#include // cudaStream_t + +// 强烈建议使用 NVIDIA 提供的 frontend 库,否则原始 API 会写到手软 +#include +namespace fe = cudnn_frontend; + + +namespace infini_train::kernels::cuda { + +namespace { +constexpr int64_t Q_UID = 101; +constexpr int64_t K_UID = 102; +constexpr int64_t V_UID = 103; +constexpr int64_t MASK_UID = 104; +constexpr int64_t O_UID = 201; +constexpr int64_t STATS_UID = 202; + +constexpr int64_t dO_UID = 301; +constexpr int64_t dQ_UID = 401; +constexpr int64_t dK_UID = 402; +constexpr int64_t dV_UID = 403; + +struct WorkspaceCache { + void *ptr = nullptr; + size_t size = 0; +}; + +static inline std::size_t hash_combine(std::size_t seed, std::size_t v) { + return seed ^ (v + 0x9e3779b97f4a7c15ULL + (seed << 6U) + (seed >> 2U)); +} + +static inline uint32_t float_to_bits(float x) { + uint32_t bits; + std::memcpy(&bits, &x, sizeof(float)); + return bits; +} + +static std::size_t hash_dims(std::vector const &dims) { + std::size_t h = 0; + for (auto d : dims) { + h = hash_combine(h, std::hash{}(d)); + } + return h; +} + +struct FwdPlanKey { + std::vector q_dims; + std::vector k_dims; + std::vector v_dims; + std::vector mask_dims; + 
int dtype = 0; + bool is_causal = false; + bool has_mask = false; + uint32_t attn_scale_bits = 0; + + bool operator==(FwdPlanKey const &other) const { + return q_dims == other.q_dims && + k_dims == other.k_dims && + v_dims == other.v_dims && + mask_dims == other.mask_dims && + dtype == other.dtype && + is_causal == other.is_causal && + has_mask == other.has_mask && + attn_scale_bits == other.attn_scale_bits; + } +}; + +struct FwdPlanKeyHash { + std::size_t operator()(FwdPlanKey const &k) const { + std::size_t h = 0; + h = hash_combine(h, hash_dims(k.q_dims)); + h = hash_combine(h, hash_dims(k.k_dims)); + h = hash_combine(h, hash_dims(k.v_dims)); + h = hash_combine(h, hash_dims(k.mask_dims)); + h = hash_combine(h, std::hash{}(k.dtype)); + h = hash_combine(h, std::hash{}(k.is_causal)); + h = hash_combine(h, std::hash{}(k.has_mask)); + h = hash_combine(h, std::hash{}(k.attn_scale_bits)); + return h; + } +}; + +struct BwdPlanKey { + std::vector q_dims; + std::vector k_dims; + std::vector v_dims; + std::vector o_dims; + std::vector do_dims; + std::vector lse_dims; + std::vector mask_dims; + int dtype = 0; + bool is_causal = false; + bool has_mask = false; + uint32_t attn_scale_bits = 0; + + bool operator==(BwdPlanKey const &other) const { + return q_dims == other.q_dims && + k_dims == other.k_dims && + v_dims == other.v_dims && + o_dims == other.o_dims && + do_dims == other.do_dims && + lse_dims == other.lse_dims && + mask_dims == other.mask_dims && + dtype == other.dtype && + is_causal == other.is_causal && + has_mask == other.has_mask && + attn_scale_bits == other.attn_scale_bits; + } +}; + +struct BwdPlanKeyHash { + std::size_t operator()(BwdPlanKey const &k) const { + std::size_t h = 0; + h = hash_combine(h, hash_dims(k.q_dims)); + h = hash_combine(h, hash_dims(k.k_dims)); + h = hash_combine(h, hash_dims(k.v_dims)); + h = hash_combine(h, hash_dims(k.o_dims)); + h = hash_combine(h, hash_dims(k.do_dims)); + h = hash_combine(h, hash_dims(k.lse_dims)); + h = 
hash_combine(h, hash_dims(k.mask_dims)); + h = hash_combine(h, std::hash{}(k.dtype)); + h = hash_combine(h, std::hash{}(k.is_causal)); + h = hash_combine(h, std::hash{}(k.has_mask)); + h = hash_combine(h, std::hash{}(k.attn_scale_bits)); + return h; + } +}; + +struct CachedPlan { + std::shared_ptr graph; + int64_t workspace_size = 0; +}; + +using FwdPlanCache = std::unordered_map; +using BwdPlanCache = std::unordered_map; +} + +// helpers for cuDNN frontend path +static cudaStream_t get_cuda_stream(const ::infini_train::Device &device) { + auto impl = ::infini_train::core::GetDeviceGuardImpl(device.type()); + auto stream_obj = impl->GetStream(device); + auto cuda_stream = dynamic_cast(stream_obj)->cuda_stream(); + return cuda_stream; +} + +static cudnnHandle_t get_cudnn_handle(const ::infini_train::Device &device) { + //用来记录现在thread正在使用哪个cuda device,cudnn handle是和device绑定的,所以需要这个信息 + int cuda_device = 0; + CUDA_CHECK(cudaGetDevice(&cuda_device)); + + static thread_local std::unordered_map handles; + auto it = handles.find(cuda_device); + if (it == handles.end()) { + cudnnHandle_t handle; + cudnnCreate(&handle); + it = handles.emplace(cuda_device, handle).first; + } + + auto cuda_stream = get_cuda_stream(device); + cudnnSetStream(it->second, cuda_stream); + + return it->second; +} + +static void *acquire_workspace(WorkspaceCache &cache, size_t requested_bytes) { + if (requested_bytes == 0) { + return nullptr; + } + if (cache.ptr == nullptr || cache.size < requested_bytes) { + if (cache.ptr != nullptr) { + CUDA_CHECK(cudaFree(cache.ptr)); + } + CUDA_CHECK(cudaMalloc(&cache.ptr, requested_bytes)); + cache.size = requested_bytes; + } + return cache.ptr; +} + +static WorkspaceCache &forward_workspace_cache() { + static thread_local WorkspaceCache cache; + return cache; +} + +static WorkspaceCache &backward_workspace_cache() { + static thread_local WorkspaceCache cache; + return cache; +} + +static FwdPlanCache &forward_plan_cache() { + static thread_local FwdPlanCache 
cache; + return cache; +} + +static BwdPlanCache &backward_plan_cache() { + static thread_local BwdPlanCache cache; + return cache; +} + +static fe::DataType_t get_cudnn_dtype(const ::infini_train::DataType dtype); +static std::shared_ptr make_graph_tensor( + const std::shared_ptr &graph, + const std::shared_ptr &tensor, + const std::string &name, + int64_t uid); +static void check_fe_status(fe::error_t status, const char *stage); +static CachedPlan const &get_or_create_fwd_plan(const std::shared_ptr &q, + const std::shared_ptr &k, + const std::shared_ptr &v, + const std::shared_ptr &attn_mask, + bool is_causal, + float attn_scale, + cudnnHandle_t handle); +static CachedPlan const &get_or_create_bwd_plan(const std::shared_ptr &grad_out, + const std::shared_ptr &q, + const std::shared_ptr &k, + const std::shared_ptr &v, + const std::shared_ptr &attn_mask, + const std::shared_ptr &out, + const std::shared_ptr &lse, + bool is_causal, + float attn_scale, + cudnnHandle_t handle); + +static std::tuple, std::shared_ptr> ExecuteSdpaForwardWithLse( + const std::shared_ptr &q, + const std::shared_ptr &k, + const std::shared_ptr &v, + const std::shared_ptr &attn_mask, + double dropout_p, + bool is_causal, + std::optional scale, + bool /*enable_gqa*/) { + if (dropout_p > 0.0) { + throw std::runtime_error("cuDNN frontend SDPA path currently does not support dropout in this minimal kernel"); + } + + auto out = std::make_shared(q->Dims(), q->Dtype(), q->GetDevice()); + + auto q_dims = q->Dims(); + CHECK_EQ(q_dims.size(), 4) << "SDPA expects 4D Q/K/V tensor layout [B, H, S, D]"; + std::vector lse_dims = {q_dims[0], q_dims[1], q_dims[2], 1}; + //lse(Log-sum-exp) + auto lse = std::make_shared(lse_dims, DataType::kFLOAT32, q->GetDevice()); + + cudnnHandle_t handle = get_cudnn_handle(q->GetDevice()); + + float attn_scale = scale.has_value() ? 
static_cast(scale.value()) + : 1.0f / std::sqrt(static_cast(q->Dims().back())); + + auto const &plan = get_or_create_fwd_plan(q, k, v, attn_mask, is_causal, attn_scale, handle); + void *workspace = acquire_workspace(forward_workspace_cache(), static_cast(plan.workspace_size)); + + std::unordered_map variant_pack = { + {Q_UID, q->DataPtr()}, + {K_UID, k->DataPtr()}, + {V_UID, v->DataPtr()}, + {O_UID, out->DataPtr()}, + {STATS_UID, lse->DataPtr()}, + }; + if (attn_mask) { + variant_pack[MASK_UID] = attn_mask->DataPtr(); + } + + auto exec_status = plan.graph->execute(handle, variant_pack, workspace); + check_fe_status(exec_status, "graph->execute"); + + return {out, lse}; +} + +static fe::DataType_t get_cudnn_dtype(const ::infini_train::DataType dtype) { + switch (dtype) { + case ::infini_train::DataType::kFLOAT32: + return fe::DataType_t::FLOAT; + case ::infini_train::DataType::kFLOAT16: + return fe::DataType_t::HALF; + case ::infini_train::DataType::kBFLOAT16: + return fe::DataType_t::BFLOAT16; + default: + throw std::runtime_error("unsupported dtype for cuDNN SDP"); + } +} + +static std::shared_ptr make_graph_tensor( + const std::shared_ptr &graph, + const std::shared_ptr &tensor, + const std::string &name, + int64_t uid) { + return graph->tensor(fe::graph::Tensor_attributes() + .set_name(name) + .set_uid(uid) + .set_dim(tensor->Dims()) + .set_stride(ComputeStrides(tensor->Dims())) + .set_data_type(get_cudnn_dtype(tensor->Dtype()))); +} + +static void check_fe_status(fe::error_t status, const char *stage) { + if (status.is_bad()) { + throw std::runtime_error(std::string(stage) + ": " + status.get_message()); + } +} + +static CachedPlan const &get_or_create_fwd_plan(const std::shared_ptr &q, + const std::shared_ptr &k, + const std::shared_ptr &v, + const std::shared_ptr &attn_mask, + bool is_causal, + float attn_scale, + cudnnHandle_t handle) { + FwdPlanKey key; + key.q_dims = q->Dims(); + key.k_dims = k->Dims(); + key.v_dims = v->Dims(); + key.has_mask = (attn_mask 
!= nullptr); + if (attn_mask) { + key.mask_dims = attn_mask->Dims(); + } + key.dtype = static_cast(q->Dtype()); + key.is_causal = is_causal; + key.attn_scale_bits = float_to_bits(attn_scale); + + //cache ——FwdPlanCache::map,根据key查找是否已经存在对应的plan,如果存在就直接返回,如果不存在就创建新的plan并插入cache + auto &cache = forward_plan_cache(); + auto it = cache.find(key); + //若能直接找到就返回对应的plan,优化速度 + if (it != cache.end()) { + return it->second; + } + + auto graph = std::make_shared(); + graph->set_io_data_type(get_cudnn_dtype(q->Dtype())) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + auto q_tensor = make_graph_tensor(graph, q, "Q", Q_UID); + auto k_tensor = make_graph_tensor(graph, k, "K", K_UID); + auto v_tensor = make_graph_tensor(graph, v, "V", V_UID); + + auto sdpa_options = fe::graph::SDPA_attributes() + .set_name("flash_attention") + .set_generate_stats(true) + .set_attn_scale(attn_scale); + + if (is_causal) { + sdpa_options.set_diagonal_alignment(cudnn_frontend::DiagonalAlignment_t::TOP_LEFT) + .set_diagonal_band_right_bound(0); + } + + if (attn_mask) { + auto mask_tensor = make_graph_tensor(graph, attn_mask, "Bias", MASK_UID); + sdpa_options.set_bias(mask_tensor); + } + + auto [out_tensor, stats_tensor] = graph->sdpa(q_tensor, k_tensor, v_tensor, sdpa_options); + out_tensor->set_output(true) + .set_uid(O_UID) + .set_dim(q->Dims()) + .set_stride(ComputeStrides(q->Dims())); + std::vector lse_dims = {q->Dims()[0], q->Dims()[1], q->Dims()[2], 1}; + stats_tensor->set_output(true) + .set_uid(STATS_UID) + .set_dim(lse_dims) + .set_stride(ComputeStrides(lse_dims)) + .set_data_type(fe::DataType_t::FLOAT); + + check_fe_status(graph->build(handle, {fe::HeurMode_t::A}), "graph->build (fwd cache build)"); + + int64_t workspace_size = 0; + check_fe_status(graph->get_workspace_size(workspace_size), "graph->get_workspace_size (fwd cache build)"); + + CachedPlan plan; + plan.graph = graph; + plan.workspace_size = workspace_size; + auto inserted 
= cache.emplace(std::move(key), std::move(plan)); + return inserted.first->second; +} + +static CachedPlan const &get_or_create_bwd_plan(const std::shared_ptr &grad_out, + const std::shared_ptr &q, + const std::shared_ptr &k, + const std::shared_ptr &v, + const std::shared_ptr &attn_mask, + const std::shared_ptr &out, + const std::shared_ptr &lse, + bool is_causal, + float attn_scale, + cudnnHandle_t handle) { + BwdPlanKey key; + key.q_dims = q->Dims(); + key.k_dims = k->Dims(); + key.v_dims = v->Dims(); + key.o_dims = out->Dims(); + key.do_dims = grad_out->Dims(); + key.lse_dims = lse->Dims(); + key.has_mask = (attn_mask != nullptr); + if (attn_mask) { + key.mask_dims = attn_mask->Dims(); + } + key.dtype = static_cast(q->Dtype()); + key.is_causal = is_causal; + key.attn_scale_bits = float_to_bits(attn_scale); + + auto &cache = backward_plan_cache(); + auto it = cache.find(key); + if (it != cache.end()) { + return it->second; + } + + auto graph = std::make_shared(); + graph->set_io_data_type(get_cudnn_dtype(q->Dtype())) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + auto q_tensor = make_graph_tensor(graph, q, "Q", Q_UID); + auto k_tensor = make_graph_tensor(graph, k, "K", K_UID); + auto v_tensor = make_graph_tensor(graph, v, "V", V_UID); + auto o_tensor = make_graph_tensor(graph, out, "O", O_UID); + auto dO_tensor = make_graph_tensor(graph, grad_out, "dO", dO_UID); + auto lse_tensor = make_graph_tensor(graph, lse, "Stats", STATS_UID); + + auto sdpa_bwd_options = fe::graph::SDPA_backward_attributes() + .set_name("flash_attention_backward") + .set_attn_scale(attn_scale) + .set_deterministic_algorithm(true); + + if (is_causal) { + sdpa_bwd_options.set_diagonal_alignment(cudnn_frontend::DiagonalAlignment_t::TOP_LEFT) + .set_diagonal_band_right_bound(0); + } + + if (attn_mask) { + auto mask_tensor = make_graph_tensor(graph, attn_mask, "Bias", MASK_UID); + sdpa_bwd_options.set_bias(mask_tensor); + } + + auto 
[dQ_tensor, dK_tensor, dV_tensor] = graph->sdpa_backward( + q_tensor, k_tensor, v_tensor, o_tensor, dO_tensor, lse_tensor, sdpa_bwd_options); + + dQ_tensor->set_output(true) + .set_uid(dQ_UID) + .set_dim(q->Dims()) + .set_stride(ComputeStrides(q->Dims())); + dK_tensor->set_output(true) + .set_uid(dK_UID) + .set_dim(k->Dims()) + .set_stride(ComputeStrides(k->Dims())); + dV_tensor->set_output(true) + .set_uid(dV_UID) + .set_dim(v->Dims()) + .set_stride(ComputeStrides(v->Dims())); + + check_fe_status(graph->build(handle, {fe::HeurMode_t::A}), "graph->build (bwd cache build)"); + + int64_t workspace_size = 0; + check_fe_status(graph->get_workspace_size(workspace_size), "graph->get_workspace_size (bwd cache build)"); + + CachedPlan plan; + plan.graph = graph; + plan.workspace_size = workspace_size; + auto inserted = cache.emplace(std::move(key), std::move(plan)); + return inserted.first->second; +} + +std::tuple, std::shared_ptr> ScaledDotProductAttentionForward( + const std::shared_ptr &q, + const std::shared_ptr &k, + const std::shared_ptr &v, + const std::shared_ptr &attn_mask, + double dropout_p, + bool is_causal, + std::optional scale, + bool enable_gqa) { + return ExecuteSdpaForwardWithLse(q, k, v, attn_mask, dropout_p, is_causal, scale, enable_gqa); +} + +std::tuple, std::shared_ptr, std::shared_ptr> +ScaledDotProductAttentionBackward( + const std::shared_ptr &grad_out, + const std::shared_ptr &q, + const std::shared_ptr &k, + const std::shared_ptr &v, + const std::shared_ptr &attn_mask, + const std::shared_ptr &out, + const std::shared_ptr &lse, + double dropout_p, + bool is_causal, + std::optional scale, + bool enable_gqa) { + + auto dq = std::make_shared(q->Dims(), q->Dtype(), q->GetDevice()); + auto dk = std::make_shared(k->Dims(), k->Dtype(), k->GetDevice()); + auto dv = std::make_shared(v->Dims(), v->Dtype(), v->GetDevice()); + + if (dropout_p > 0.0) { + throw std::runtime_error("cuDNN frontend SDPA path currently does not support dropout in this minimal 
kernel"); + } + (void)enable_gqa; + + + // ---------- cuDNN frontend implementation ---------- + cudnnHandle_t handle = get_cudnn_handle(grad_out->GetDevice()); + + float attn_scale = scale.has_value() ? static_cast(scale.value()) + : 1.0f / std::sqrt(static_cast(q->Dims().back())); + + auto const &plan = get_or_create_bwd_plan(grad_out, q, k, v, attn_mask, out, lse, is_causal, attn_scale, handle); + void *workspace = acquire_workspace(backward_workspace_cache(), static_cast(plan.workspace_size)); + + std::unordered_map variant_pack = { + {Q_UID, q->DataPtr()}, + {K_UID, k->DataPtr()}, + {V_UID, v->DataPtr()}, + {O_UID, out->DataPtr()}, + {dO_UID, grad_out->DataPtr()}, + {STATS_UID, lse->DataPtr()}, + {dQ_UID, dq->DataPtr()}, + {dK_UID, dk->DataPtr()}, + {dV_UID, dv->DataPtr()}, + }; + if (attn_mask) { + variant_pack[MASK_UID] = attn_mask->DataPtr(); + } + + auto exec_status = plan.graph->execute(handle, variant_pack, workspace); + check_fe_status(exec_status, "graph->execute (backward)"); + + return {dq, dk, dv}; +} + +} + + +#define REGISTER_CUDA_LINEAR_KERNEL(kernel_name) \ + REGISTER_KERNEL(infini_train::Device::DeviceType::kCUDA, kernel_name, infini_train::kernels::cuda::kernel_name) + +REGISTER_CUDA_LINEAR_KERNEL(ScaledDotProductAttentionBackward) +REGISTER_CUDA_LINEAR_KERNEL(ScaledDotProductAttentionForward) + +#undef REGISTER_CUDA_LINEAR_KERNEL diff --git a/infini_train/src/kernels/cuda/no_op.cu b/infini_train/src/kernels/cuda/no_op.cu index ef2c9566..9b8b04e3 100644 --- a/infini_train/src/kernels/cuda/no_op.cu +++ b/infini_train/src/kernels/cuda/no_op.cu @@ -3,6 +3,8 @@ #include "infini_train/include/dispatcher.h" #include "infini_train/include/tensor.h" + +//这里不用用到并行算法,直接把输入张量的视图返回即可 namespace infini_train::kernels::cuda { std::shared_ptr NoOpForward(const std::shared_ptr &input, const std::vector &dims) { const int64_t num_elements = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies()); diff --git 
a/infini_train/src/kernels/cuda/ysyx.code-workspace b/infini_train/src/kernels/cuda/ysyx.code-workspace new file mode 100644 index 00000000..afc35437 --- /dev/null +++ b/infini_train/src/kernels/cuda/ysyx.code-workspace @@ -0,0 +1,8 @@ +{ + "folders": [ + { + "path": "../../../../.." + } + ], + "settings": {} +} \ No newline at end of file diff --git a/infini_train/src/nn/functional.cc b/infini_train/src/nn/functional.cc index b02f185a..a159f6b9 100644 --- a/infini_train/src/nn/functional.cc +++ b/infini_train/src/nn/functional.cc @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include #include "infini_train/include/autograd/activations.h" #include "infini_train/include/autograd/elementwise.h" @@ -10,6 +13,7 @@ #include "infini_train/include/autograd/reduction.h" #include "infini_train/include/autograd/softmax.h" #include "infini_train/include/autograd/transform.h" +#include "infini_train/include/autograd/scaled_dot_product_attention.h" #include "infini_train/include/nn/init.h" #include "infini_train/include/tensor.h" @@ -79,4 +83,22 @@ std::shared_ptr Softmax(const std::shared_ptr &input, int64_t di std::shared_ptr Sigmoid(const std::shared_ptr &input) { return std::make_shared()->Apply({input})[0]; } -} // namespace infini_train::nn::function + +std::shared_ptr ScaledDotProductAttention( + const std::shared_ptr &query, + const std::shared_ptr &key, + const std::shared_ptr &value, + const std::shared_ptr &attn_mask, + double dropout_p, + bool is_causal, + const std::optional &scale, + bool enable_gqa) { + std::vector> inputs = {query, key, value}; + if (attn_mask) inputs.push_back(attn_mask); + auto fn = std::make_shared( + dropout_p, is_causal, scale, enable_gqa); + return fn->Apply(inputs)[0]; +} + +} +// namespace infini_train::nn::function diff --git a/infini_train/src/nn/parallel/ddp/distributed_data_parallel.cc b/infini_train/src/nn/parallel/ddp/distributed_data_parallel.cc index 002fe318..c8a4ee3b 100644 --- 
a/infini_train/src/nn/parallel/ddp/distributed_data_parallel.cc +++ b/infini_train/src/nn/parallel/ddp/distributed_data_parallel.cc @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -24,7 +25,11 @@ DistributedDataParallel::DistributedDataParallel(std::shared_ptr mod const DistributedDataParallelConfig ddp_config) : ddp_config_(ddp_config), ddp_pg_(ProcessGroupFactory::Instance()->Get(GetDataParallelProcessGroupName(rank.GlobalRank()))) { + std::unordered_set seen_params; for (auto ¶m : module->Parameters()) { + if (!param || !seen_params.insert(param.get()).second) { + continue; + } auto device = param->GetDevice(); CHECK_EQ(device.index(), rank.thread_rank()) << "All parameters must be on the same device as the module"; if (!ddp_config.gradient_bucketing_enabled && !ddp_config.use_distributed_optimizer) { @@ -130,7 +135,11 @@ void DistributedDataParallel::RegisterBackwardHooks() { }; auto &module = modules_.at(kModuleName); + std::unordered_set seen_params; for (auto ¶m : module->Parameters()) { + if (!param || !seen_params.insert(param.get()).second) { + continue; + } if (!param->requires_grad()) { continue; } diff --git a/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc b/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc index 55e5800b..a96f58d7 100644 --- a/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc +++ b/infini_train/src/nn/parallel/ddp/distributed_optimizer.cc @@ -91,6 +91,10 @@ void DistributedOptimizer::FinishGradSync() { for (auto &group : bucket_groups_) { group->FinishGradSync(); } } +void DistributedOptimizer::SetIsLastMicrobatch(bool is_last_microbatch) { + for (auto &group : bucket_groups_) { group->SetIsLastMicrobatch(is_last_microbatch); } +} + void DistributedOptimizer::StartParamSync(bool force_sync) { for (auto &group : bucket_groups_) { group->StartParamSync(force_sync); } } diff --git a/infini_train/src/nn/parallel/ddp/param_and_grad_buffer.cc 
b/infini_train/src/nn/parallel/ddp/param_and_grad_buffer.cc index 75a21f63..e2a1eccf 100644 --- a/infini_train/src/nn/parallel/ddp/param_and_grad_buffer.cc +++ b/infini_train/src/nn/parallel/ddp/param_and_grad_buffer.cc @@ -147,6 +147,10 @@ void ParamAndGradBucketGroup::RegisterGradReady(const std::shared_ptr &p } } +void ParamAndGradBucketGroup::SetIsLastMicrobatch(bool is_last_microbatch) { + is_last_microbatch_ = is_last_microbatch; +} + void ParamAndGradBucketGroup::StartGradSync() { if (!collective_pg_) { LOG(FATAL) << "ParamAndGradBucketGroup: StartGradSync() called with null collective_pg_."; diff --git a/infini_train/src/optimizer.cc b/infini_train/src/optimizer.cc index 2c9b218a..890c9c73 100644 --- a/infini_train/src/optimizer.cc +++ b/infini_train/src/optimizer.cc @@ -25,10 +25,14 @@ void SGD::Step() { LOG(INFO) << "Skipping param with null grad."; continue; } + auto grad = param->grad(); + if (grad->Dtype() != param->Dtype()) { + grad = std::make_shared(grad->To(param->Dtype())); + } auto device = param->GetDevice(); core::DeviceGuard guard(device); auto kernel = Dispatcher::Instance().GetKernel({device.type(), "AccumulateGrad"}); - kernel.Call(param->grad(), -learning_rate_, param); + kernel.Call(grad, -learning_rate_, param); } } @@ -53,11 +57,14 @@ void Adam::Step() { for (size_t i = 0; i < params_.size(); ++i) { auto ¶m = params_[i]; - const auto &grad = param->grad(); + auto grad = param->grad(); if (!grad) { LOG(INFO) << "Skipping param with null grad."; continue; } + if (grad->Dtype() != param->Dtype()) { + grad = std::make_shared(grad->To(param->Dtype())); + } auto &m = m_[i]; auto &v = v_[i]; diff --git a/infini_train/src/tensor.cc b/infini_train/src/tensor.cc index 6c243fea..56da32e5 100644 --- a/infini_train/src/tensor.cc +++ b/infini_train/src/tensor.cc @@ -98,6 +98,7 @@ size_t Tensor::SizeInBytes() const { return kDataTypeToSize.at(dtype_) * num_ele const std::vector &Tensor::Dims() const { return dims_; } + size_t Tensor::NumElements() 
const { return num_elements_; } DataType Tensor::Dtype() const { return dtype_; } diff --git a/scripts/precision_check/precision_compare.py b/scripts/precision_check/precision_compare.py deleted file mode 100755 index 40c91308..00000000 --- a/scripts/precision_check/precision_compare.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python3 -""" -Precision comparison tool for InfiniTrain tensor outputs. - -Usage: - python precision_compare.py --dir1 ./run1 --dir2 ./run2 [--atol 1e-5] [--rtol 1e-3] - -Compares .npy files between two directories and reports differences. -""" - -import argparse -import os -import sys -from pathlib import Path - -import numpy as np - - -def find_npy_files(directory: str) -> dict[str, Path]: - """Find all .npy files in directory (recursively).""" - files = {} - for path in Path(directory).rglob("*.npy"): - rel_path = path.relative_to(directory) - files[str(rel_path)] = path - return files - - -def compare_tensors(file1: Path, file2: Path, atol: float, rtol: float) -> dict: - """Compare two tensor files and return comparison results.""" - arr1 = np.load(file1) - arr2 = np.load(file2) - - result = { - "file": str(file1.name), - "shape1": arr1.shape, - "shape2": arr2.shape, - "dtype1": str(arr1.dtype), - "dtype2": str(arr2.dtype), - "match": False, - "error": None, - } - - if arr1.shape != arr2.shape: - result["error"] = f"Shape mismatch: {arr1.shape} vs {arr2.shape}" - return result - - if arr1.dtype != arr2.dtype: - result["error"] = f"Dtype mismatch: {arr1.dtype} vs {arr2.dtype}" - return result - - arr1_flat = arr1.astype(np.float64).flatten() - arr2_flat = arr2.astype(np.float64).flatten() - - abs_diff = np.abs(arr1_flat - arr2_flat) - max_abs_diff = np.max(abs_diff) - mean_abs_diff = np.mean(abs_diff) - - with np.errstate(divide="ignore", invalid="ignore"): - rel_diff = abs_diff / (np.abs(arr2_flat) + 1e-12) - rel_diff = np.where(np.isfinite(rel_diff), rel_diff, 0) - max_rel_diff = np.max(rel_diff) - mean_rel_diff = 
np.mean(rel_diff) - - result["max_abs_diff"] = float(max_abs_diff) - result["mean_abs_diff"] = float(mean_abs_diff) - result["max_rel_diff"] = float(max_rel_diff) - result["mean_rel_diff"] = float(mean_rel_diff) - result["match"] = np.allclose(arr1, arr2, atol=atol, rtol=rtol) - - return result - - -def main(): - parser = argparse.ArgumentParser(description="Compare precision check outputs") - parser.add_argument("--dir1", required=True, help="First directory") - parser.add_argument("--dir2", required=True, help="Second directory") - parser.add_argument("--atol", type=float, default=1e-5, help="Absolute tolerance") - parser.add_argument("--rtol", type=float, default=1e-3, help="Relative tolerance") - parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") - args = parser.parse_args() - - if not os.path.isdir(args.dir1): - print(f"Error: {args.dir1} is not a directory") - sys.exit(1) - if not os.path.isdir(args.dir2): - print(f"Error: {args.dir2} is not a directory") - sys.exit(1) - - files1 = find_npy_files(args.dir1) - files2 = find_npy_files(args.dir2) - - print(f"Directory 1: {args.dir1} ({len(files1)} files)") - print(f"Directory 2: {args.dir2} ({len(files2)} files)") - print(f"Tolerance: atol={args.atol}, rtol={args.rtol}") - print() - - only_in_1 = set(files1.keys()) - set(files2.keys()) - only_in_2 = set(files2.keys()) - set(files1.keys()) - common = set(files1.keys()) & set(files2.keys()) - - if only_in_1: - print(f"Files only in dir1 ({len(only_in_1)}):") - for f in sorted(only_in_1): - print(f" {f}") - print() - - if only_in_2: - print(f"Files only in dir2 ({len(only_in_2)}):") - for f in sorted(only_in_2): - print(f" {f}") - print() - - if not common: - print("No common files to compare") - sys.exit(1) - - print(f"Comparing {len(common)} common files...") - print() - - passed = 0 - failed = 0 - errors = 0 - - for rel_path in sorted(common): - result = compare_tensors(files1[rel_path], files2[rel_path], args.atol, args.rtol) - - 
if result["error"]: - errors += 1 - print(f"ERROR: {rel_path}") - print(f" {result['error']}") - elif result["match"]: - passed += 1 - if args.verbose: - print(f"PASS: {rel_path}") - print(f" max_abs={result['max_abs_diff']:.2e} max_rel={result['max_rel_diff']:.2e}") - else: - failed += 1 - print(f"FAIL: {rel_path}") - print(f" shape={result['shape1']} dtype={result['dtype1']}") - print(f" max_abs={result['max_abs_diff']:.2e} mean_abs={result['mean_abs_diff']:.2e}") - print(f" max_rel={result['max_rel_diff']:.2e} mean_rel={result['mean_rel_diff']:.2e}") - - print() - print("=" * 50) - print(f"Summary: {passed} passed, {failed} failed, {errors} errors") - print(f"Missing: {len(only_in_1)} in dir1 only, {len(only_in_2)} in dir2 only") - - if failed > 0 or errors > 0: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/scripts/run_models_and_profile.bash b/scripts/run_models_and_profile.bash index 1cf27935..f99761c9 100755 --- a/scripts/run_models_and_profile.bash +++ b/scripts/run_models_and_profile.bash @@ -17,12 +17,19 @@ read_var() { jq -r --arg k "$key" '.variables[$k] // empty' "$CONFIG_FILE" } -BUILD_DIR="$(read_var BUILD_DIR)"; : "${BUILD_DIR:=../build}" -LOG_DIR="$(read_var LOG_DIR)"; : "${LOG_DIR:=logs}" -PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)"; : "${PROFILE_LOG_DIR:=./profile_logs}" -COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}" - +BUILD_DIR="$(read_var BUILD_DIR)"; : "${BUILD_DIR:=../build}" +LOG_DIR="$(read_var LOG_DIR)"; : "${LOG_DIR:=logs}" +PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)"; : "${PROFILE_LOG_DIR:=./profile_logs}" +COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}" +FLASH="$(read_var FLASH)"; : "${FLASH:=}" + +# --- 关键修改 1: 初始化大容量分区的绝对路径临时目录 --- +# 先确保 build 目录存在,以便获取其绝对路径 mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR" +# 获取绝对路径,防止 CMake 切换目录后找不到相对路径 +export CUSTOM_TMP="$(readlink -f "$BUILD_DIR")/tmp_cache" +mkdir -p "$CUSTOM_TMP" +export TMPDIR="$CUSTOM_TMP" # export 
custom PATHs export BUILD_DIR LOG_DIR PROFILE_LOG_DIR @@ -34,11 +41,16 @@ done < <(jq -r '.variables | to_entries[] | "\(.key)=\(.value)"' "$CONFIG_FILE") # Global variable to save the last cmake command LAST_CMAKE_CMD="" -# Clean the build directory +# --- 关键修改 2: 在清理函数中重新创建临时目录 --- clean_build_dir() { echo -e "\033[1;31m[CLEAN] Removing all contents in: ${BUILD_DIR}\033[0m" - mkdir -p "$BUILD_DIR" + # 删除 build 下所有内容(这会删掉旧的 tmp_cache) rm -rf "${BUILD_DIR:?}/"* + # 重新创建 build 目录 + mkdir -p "$BUILD_DIR" + # 核心:必须重新创建 TMPDIR 目录,否则编译器的路径会失效 + mkdir -p "$TMPDIR" + echo -e "\033[1;34m[TMP] Re-created temp space at: $TMPDIR\033[0m" } # Run a command and log output @@ -52,38 +64,27 @@ run_and_log() { echo -e "\033[1;32m============================================================\033[0m" echo -e "\033[1;36m[$timestamp] [Running] ${log_name}\033[0m" - - # Print the command being executed echo -e "\033[1;33mCommand:\033[0m $cmd" - - # Print the most recent CMake command if [[ -n "$LAST_CMAKE_CMD" ]]; then echo -e "\033[1;34mLast CMake Command:\033[0m $LAST_CMAKE_CMD" fi - echo -e "\033[1;33mLog file:\033[0m $log_path" - - # Notify if profiling mode is enabled if [[ "$is_profile" == "yes" ]]; then echo -e "\033[1;35m[PROFILE MODE ON] Profiling logs will be saved to: ${PROFILE_LOG_DIR}\033[0m" fi - echo -e "\033[1;32m============================================================\033[0m" pushd "$BUILD_DIR" > /dev/null - # Write the last cmake command into the log file if available if [[ -n "$LAST_CMAKE_CMD" ]]; then echo "[LAST_CMAKE] $LAST_CMAKE_CMD" > "$log_path" else - # If no cmake command has been run yet, clear the log > "$log_path" fi - # Write the current run command to the log echo "[COMMAND] $cmd" >> "$log_path" - # Run the command and append both stdout and stderr to the log file + # 执行命令并重定向输出 if ! 
eval "$cmd" >> "$log_path" 2>&1; then echo -e "\033[1;31m============================================================\033[0m" echo -e "\033[1;31m[ERROR] Command failed: ${cmd}\033[0m" @@ -97,39 +98,29 @@ run_and_log() { popd > /dev/null - # If profiling is enabled, move profiling files to the target directory if [[ "$is_profile" == "yes" ]]; then move_profile_logs "$log_name" fi } - # Move profiling output logs move_profile_logs() { local prefix="$1" - - # Move *.report.rankN files for report_file in "${BUILD_DIR}"/*.report.rank*; do if [[ -f "$report_file" ]]; then - local base_name - base_name=$(basename "$report_file") + local base_name=$(basename "$report_file") mv "$report_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}" - echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}" fi done - - # Move *.records.log.rankN files for record_file in "${BUILD_DIR}"/*.records.log.rank*; do if [[ -f "$record_file" ]]; then - local base_name - base_name=$(basename "$record_file") + local base_name=$(basename "$record_file") mv "$record_file" "${PROFILE_LOG_DIR}/${prefix}_${base_name}" - echo "Moved $base_name to ${PROFILE_LOG_DIR}/${prefix}_${base_name}" fi done } -# Build "--key value" arg string from tests[i].args (shell-escaped) +# Build args string args_string_for_test() { local idx="$1" jq -r --argjson i "$idx" ' @@ -150,11 +141,10 @@ for ((id=0; id' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IndentCaseLabels: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true 
+SortIncludes: false +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +TabWidth: 4 +UseTab: Never +--- +Language: Json +# Don't format .json files. +DisableFormat: true +... + diff --git a/third_party/cudnn-frontend/.github/ISSUE_TEMPLATE/bug_report.md b/third_party/cudnn-frontend/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..cf4b0051 --- /dev/null +++ b/third_party/cudnn-frontend/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,44 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**System Environment (please complete the following information):** + - cudnn_frontend version: [e.g. v1.4.0] + - cudnn_backend version: [e.g. v9.1.0] + - GPU arch: [e.g. RTX 4090] + - cuda runtime version: [e.g. 12.4] + - cuda driver version: [e.g. 553.04] + - host compiler: [e.g. clang19] + - OS: [e.g. ubuntu22.04] + +**API logs** +Please attach API logs for both cudnn_frontend and cudnn_backend. +``` +// For cudnn_frontend +export CUDNN_FRONTEND_LOG_FILE=fe.log +export CUDNN_FRONTEND_LOG_INFO=1 + +// For cudnn_backend +export CUDNN_LOGLEVEL_DBG=3 +export CUDNN_LOGDEST_DBG=be.log +``` + +**To Reproduce** +Steps to reproduce the behavior: +1. '...' +2. '....' +3. '....' + +**Additional context** +Add any other context about the problem here. 
diff --git a/third_party/cudnn-frontend/CMakeLists.txt b/third_party/cudnn-frontend/CMakeLists.txt new file mode 100644 index 00000000..ff0438a6 --- /dev/null +++ b/third_party/cudnn-frontend/CMakeLists.txt @@ -0,0 +1,107 @@ +cmake_minimum_required(VERSION 3.23) + +project(cudnn_frontend VERSION 1.18.0) + +option(CUDNN_FRONTEND_SKIP_JSON_LIB "Defines whether FE should not include nlohmann/json.hpp." OFF) +option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON) +option(CUDNN_FRONTEND_BUILD_TESTS "Defines if unittests are built or not." ON) +option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." OFF) + +if(MSVC OR MSYS OR MINGW) + add_compile_options(/W4 /WX) +else() + add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-error=attributes -Wno-attributes -Wno-error=unused-function -Wno-unused-function) +endif() + +add_library(cudnn_frontend INTERFACE) + +# Add header files to library +file(GLOB_RECURSE CUDNN_FRONTEND_INCLUDE_FILES "include/*") +target_sources( + cudnn_frontend PUBLIC FILE_SET HEADERS + BASE_DIRS "$" + FILES "${CUDNN_FRONTEND_INCLUDE_FILES}" +) +unset(CUDNN_FRONTEND_INCLUDE_FILES) + +target_compile_definitions( + cudnn_frontend INTERFACE + $<$:CUDNN_FRONTEND_SKIP_JSON_LIB> +) + +target_include_directories( + cudnn_frontend INTERFACE + $ + $ +) + +# Find the cuda compiler +find_package(CUDAToolkit REQUIRED) + +target_include_directories( + cudnn_frontend INTERFACE + ${CUDAToolkit_INCLUDE_DIRS} +) + +target_compile_features(cudnn_frontend INTERFACE cxx_std_17) + +# Make PCH for targets to link against +add_library(_cudnn_frontend_pch INTERFACE) +target_precompile_headers(_cudnn_frontend_pch INTERFACE ${PROJECT_SOURCE_DIR}/include/cudnn_frontend.h) + +if (CUDNN_FRONTEND_BUILD_SAMPLES) + add_subdirectory(samples) +endif() + +if (CUDNN_FRONTEND_BUILD_TESTS) + add_subdirectory(test) +endif() + +if (CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS) + add_subdirectory(python) +endif() + +# Introduce 
variables: +# * CMAKE_INSTALL_LIBDIR +# * CMAKE_INSTALL_BINDIR +# * CMAKE_INSTALL_INCLUDEDIR +include(GNUInstallDirs) + +# See https://cmake.org/cmake/help/latest/module/CMakePackageConfigHelpers.html#example-generating-package-files +include(CMakePackageConfigHelpers) + +# Install the components +install( + TARGETS cudnn_frontend + EXPORT cudnn_frontend_targets FILE_SET HEADERS +) + +if (CUDNN_FRONTEND_BUILD_SAMPLES) + install(TARGETS legacy_samples samples RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +endif() + +if (CUDNN_FRONTEND_BUILD_TESTS) + install(TARGETS tests RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +endif() + +# Export the targets +export( + EXPORT cudnn_frontend_targets + FILE "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend/cudnn_frontend-targets.cmake" +) +install( + EXPORT cudnn_frontend_targets + FILE cudnn_frontend-targets.cmake + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend" +) + +# Install the CMake configuration file for header discovery +configure_package_config_file( + cudnn_frontend-config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend-config.cmake" + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend" +) +install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend-config.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend" +) diff --git a/third_party/cudnn-frontend/CONTRIBUTING.md b/third_party/cudnn-frontend/CONTRIBUTING.md new file mode 100644 index 00000000..5ccb14ac --- /dev/null +++ b/third_party/cudnn-frontend/CONTRIBUTING.md @@ -0,0 +1,55 @@ +# Contributing to cudnn-frontend + +If you are interested in contributing to cudnn-frontend, your contributions will fall +into three categories: +1. You want to report a bug, feature request, or documentation issue + - File an [issue](https://github.com/NVIDIA/cudnn-frontend/issues) + describing what you encountered or what you want to see changed. + - The cudnn team will evaluate the issues and triage them, scheduling + them for a release. 
If you believe the issue needs priority attention + comment on the issue to notify the team. +2. You want to propose a new Feature and implement it + - Post about your intended feature, and we shall discuss the design and + implementation. + - Once we agree that the plan looks good, go ahead and implement it, using + the [code contributions](#code-contributions) guide below. +3. You want to implement a feature or bug-fix for an outstanding issue + - Follow the [code contributions](#code-contributions) guide below. + - If you need more context on a particular issue, please ask and we shall + provide. + +## Code contributions + +### Your first issue + +1. Read the project's [README.md](https://github.com/NVIDIA/cudnn-frontend/blob/main/README.md) + to learn how to setup the development environment. +2. Comment on the issue saying you are going to work on it and what changes you are going to make. +3. Code! Make sure to update unit tests! +4. When done, [create your pull request](https://github.com/NVIDIA/cudnn-frontend/compare). +5. Wait for other developers to review your code and update code as needed. +6. Once reviewed and approved, a cudnn-frontend developer will merge your pull request. +7. At this time, we are accepting only small fixes, changes. Once merged to main this will be an untagged version. A release tag will be assigned along with future frontend release by cudnn team. + +Remember, if you are unsure about anything, don't hesitate to comment on issues and ask for clarifications! + +## Code Formatting + +Consistent code formatting is important in the cudnn-frontend project to ensure +readability, maintainability, and thus simplifies collaboration. + +### Branches and Versions + +The cudnn-frontend repository has one main branch. Please submit a PR to this branch. We will update the doc as the policy changes. 
+ +### Branch naming + +Branches used to create PRs should have a name of the form `<name>-issue-<issue_number>` +which conforms to the following conventions: + +- Name: + - A name to convey what is being worked on + - Please use dashes or underscores between words as opposed to spaces. + +## Attribution +Portions of contribution guide adopted from [https://github.com/rapidsai/cuml/blob/branch-24.04/CONTRIBUTING.md](https://github.com/rapidsai/cuml/blob/branch-24.04/CONTRIBUTING.md) diff --git a/third_party/cudnn-frontend/LICENSE.txt b/third_party/cudnn-frontend/LICENSE.txt new file mode 100644 index 00000000..eef9c446 --- /dev/null +++ b/third_party/cudnn-frontend/LICENSE.txt @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE.
+ */ diff --git a/third_party/cudnn-frontend/README.md b/third_party/cudnn-frontend/README.md new file mode 100644 index 00000000..2616f208 --- /dev/null +++ b/third_party/cudnn-frontend/README.md @@ -0,0 +1,130 @@ +# cuDNN FrontEnd (FE) + +**cuDNN FE** is the modern, open-source entry point to the NVIDIA cuDNN library and high performance open-source kernels. It provides a C++ header-only library and a Python interface to access the powerful cuDNN Graph API and open-source kernels. + +## Key Features + +* **Unified Graph API:** Create reusable, persistent `cudnn_frontend::graph::Graph` objects to describe complex subgraphs. +* **Ease of Use:** Simplified C++ and Python bindings (via `pybind11`) that abstract away the boilerplate of the backend API. +* **Performance:** Built-in autotuning and support for the latest NVIDIA GPU architectures. + +## Benchmarks + +To run the SDPA benchmarks, refer to the [benchmarks/sdpa](https://github.com/NVIDIA/cudnn-frontend/blob/main/benchmark/sdpa_benchmark_training/README.md) folder.
Current results: + +### GB200 - Llama 3.1 Causal (top_left) +![Llama 3.1 Causal on GB200](https://raw.githubusercontent.com/NVIDIA/cudnn-frontend/main/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB200 GPU + +### GB200 - Llama 3.1 Non-Causal (no_mask) +![Llama 3.1 Non-Causal on GB200](https://raw.githubusercontent.com/NVIDIA/cudnn-frontend/main/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_no_mask.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=False` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB200 GPU + +### GB200 - DeepSeek V3 Causal (top_left) +![DeepSeek V3 Causal on GB200](https://raw.githubusercontent.com/NVIDIA/cudnn-frontend/main/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/dsv3_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=128; num_kv_heads=128; head_dim_qk=192; head_dim_vo=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB200 GPU + +### GB300 - Llama 3.1 Causal (top_left) +![Llama 3.1 Causal on GB300](https://raw.githubusercontent.com/NVIDIA/cudnn-frontend/main/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB300 GPU + +### GB300 - Llama 3.1 Non-Causal (no_mask) +![Llama 3.1 Non-Causal on GB300](https://raw.githubusercontent.com/NVIDIA/cudnn-frontend/main/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_no_mask.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=False` +- Sequence lengths shown on x-axis +- Results obtained on 
NVIDIA GB300 GPU + +### GB300 - DeepSeek V3 Causal (top_left) +![DeepSeek V3 Causal on GB300](https://raw.githubusercontent.com/NVIDIA/cudnn-frontend/main/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/dsv3_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=128; num_kv_heads=128; head_dim_qk=192; head_dim_vo=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB300 GPU + + +## Installation + +### 🐍 Python + +The easiest way to get started is via pip: + +```bash +pip install nvidia_cudnn_frontend +``` + +**Requirements:** +* Python 3.8+ +* NVIDIA driver and CUDA Toolkit + +### ⚙️ C++ (Header Only) + +Since the C++ API is header-only, integration is seamless. Simply include the header in your compilation unit: + +```cpp +#include <cudnn_frontend.h> +``` + +Ensure your include path points to the `include/` directory of this repository. + +## Building from Source + +If you want to build the Python bindings from source or run the C++ samples: + +**1. Dependencies** +* `python-dev` (e.g., `apt-get install python-dev`) +* Dependencies listed in `requirements.txt` (`pip install -r requirements.txt`) + +**2. Python Source Build** +```bash +pip install -v git+https://github.com/NVIDIA/cudnn-frontend.git +``` +*Environment variables `CUDAToolkit_ROOT` and `CUDNN_PATH` can be used to override default paths.* + +**3. C++ Samples Build** +```bash +mkdir build && cd build +cmake -DCUDNN_PATH=/path/to/cudnn -DCUDAToolkit_ROOT=/path/to/cuda ../ +cmake --build . -j16 +./bin/samples +``` + +## Documentation & Examples + +* **Developer Guide:** [Official NVIDIA Documentation](https://docs.nvidia.com/deeplearning/cudnn/frontend/v1.9.0/developer/overview.html) + +* **C++ Samples:** See `samples/cpp` for comprehensive usage examples. +* **Python Samples:** See `samples/python` for pythonic implementations. + +## 🤝 Contributing + +We welcome contributions!
Whether you are fixing a bug, improving documentation, or optimizing one of our new OSS kernels, your help makes cuDNN better for everyone. + +1. Check the [Contribution Guide](CONTRIBUTING.md) for details. +2. Fork the repo and create your branch. +3. Submit a Pull Request. + +## Debugging + +To view the execution flow and debug issues, you can enable logging via environment variables: + +```bash +# Log to stdout +export CUDNN_FRONTEND_LOG_INFO=1 +export CUDNN_FRONTEND_LOG_FILE=stdout + +# Log to a file +export CUDNN_FRONTEND_LOG_INFO=1 +export CUDNN_FRONTEND_LOG_FILE=execution_log.txt +``` + +Alternatively, you can control logging programmatically via `cudnn_frontend::isLoggingEnabled()` + +## License + +This project is licensed under the [MIT License](LICENSE). diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/Dockerfile b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/Dockerfile new file mode 100644 index 00000000..8a0efd6d --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/Dockerfile @@ -0,0 +1,28 @@ +FROM nvcr.io/nvidia/pytorch:25.12-py3 + +# Set working directory +WORKDIR /workspace + +# Update libcudnn9-cuda-13 +RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb && \ + dpkg -i cuda-keyring_1.1-1_all.deb && \ + apt-get remove -y *cudnn9* && \ + apt-get update && \ + apt-get -y install cudnn && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Clone cudnn_frontend and install latest cudnn +RUN git clone https://github.com/NVIDIA/cudnn-frontend.git +RUN pip install -v cudnn-frontend + +# Clone flash-attention +RUN pip uninstall -y flash-attn && \ + git clone https://github.com/Dao-AILab/flash-attention.git && \ + cd flash-attention && \ + sed -i 's/^ import flash_attn_2_cuda as flash_attn_gpu$/ pass/' /workspace/flash-attention/flash_attn/flash_attn_interface.py +RUN pip install nvidia-cutlass-dsl apache-tvm-ffi quack-kernels 
+ENV PYTHONPATH=/workspace/flash-attention + +# Install additional dependencies for benchmarking +RUN pip install seaborn \ No newline at end of file diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/README.md b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/README.md new file mode 100644 index 00000000..87368634 --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/README.md @@ -0,0 +1,234 @@ +# Scaled Dot Product Attention Benchmark + +## Introduction + +This directory contains benchmarking tools for Scaled Dot Product Attention (SDPA) operations across various backends. The benchmarks target training use cases with support for causal masking and grouped query attention (GQA). + +## Contents + +- `Dockerfile` - Docker container setup for running benchmarks +- `benchmark_single_sdpa.py` - Single SDPA benchmark script +- `configs/` - Benchmark configuration files + - `llama.py` - Llama 3.1 GQA benchmarks (causal + non-causal) + - `dsv3.py` - DeepSeek V3 MHA benchmarks (causal only) +- `runner.py` - Configuration-based benchmark runner +- `config_types.py` - Data types for benchmark configuration +- `charts.py` - Chart generation utilities +- `../results/` - Benchmark outputs (CSV and charts) + +## Quick Start + +### 1. Build Docker Container + +```bash +docker build -t cudnn_attention_benchmark . + +docker run -it --gpus all --rm cudnn_attention_benchmark +``` + +### 2. 
Run Benchmarks + +```bash +# Run Llama 3.1 benchmark suite +python -m benchmark.sdpa_benchmark_training.runner --config llama + +# Run DeepSeek V3 benchmark suite +python -m benchmark.sdpa_benchmark_training.runner --config dsv3 + +# Dry run (show what would be executed) +python -m benchmark.sdpa_benchmark_training.runner --config llama --dry-run + +# Filter by backend +python -m benchmark.sdpa_benchmark_training.runner --config llama --backend cudnn + +# Filter by data type +python -m benchmark.sdpa_benchmark_training.runner --config llama --dtype bfloat16 +``` + +## Configuration-Based Benchmarking + +### Creating Custom Configurations + +1. Copy the template: + ```bash + cp configs/llama.py configs/my_config.py + ``` + +2. Edit your config: + ```python + from ..config_types import ModelPreset, BenchmarkConfig + + MY_MODEL = ModelPreset( + name="my_model", + num_q_heads=32, + num_kv_heads=8, + head_dim=128, + ) + + CONFIG = BenchmarkConfig( + name="my_benchmark", + models=[MY_MODEL], + seqlens=[(4096, 4096), (8192, 8192)], + backends=["cudnn", "flash_attention_4"], + data_types=["bfloat16", "fp8"], + attn_masks=["top_left", "no_mask"], + profile_pass="fwd", # "fwd", "bwd", or "both" + num_iterations=10, + ) + ``` + +3. 
Run: + ```bash + python -m benchmark.sdpa_benchmark_training.runner --config my_config + ``` + +### Configuration Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `models` | List of `ModelPreset` to benchmark | Required | +| `seqlens` | List of `(q_seqlen, kv_seqlen)` tuples | Required | +| `backends` | Backends to compare | `["cudnn"]` | +| `data_types` | Data types to test | `["bfloat16"]` | +| `attn_masks` | Attention masks (`top_left`, `no_mask`, `bottom_right`) | `["top_left"]` | +| `profile_pass` | Which pass to profile (`fwd`, `bwd`, `both`) | `"fwd"` | +| `batch_size` | Batch size | `1` | +| `num_iterations` | Iterations per benchmark | `10` | +| `deterministic_bwd` | Deterministic modes for backward | `[False]` | + +### Model Presets + +Standard model: +```python +LLAMA3_1 = ModelPreset( + name="llama3.1", + num_q_heads=64, + num_kv_heads=8, + head_dim=128, +) +``` + +Asymmetric head dimensions (DeepSeek V3): +```python +DSV3 = ModelPreset( + name="dsv3", + num_q_heads=128, + num_kv_heads=128, + head_dim_qk=192, # Q/K head dimension + head_dim_vo=128, # V/O head dimension +) +``` + +### Output + +The runner produces (in `benchmark/results/`): +- **CSV**: `_.csv` +- **Charts**: Separate chart per mask type: + - `_top_left.png` (causal) + - `_no_mask.png` (non-causal) +- Charts show backends side-by-side with distinct colors for BF16 vs FP8 + +## Single Benchmark Script + +For running individual benchmarks: + +```bash +# cuDNN Frontend (BF16) +python benchmark_single_sdpa.py \ + --batch_size 1 --q_seqlen 8192 --kv_seqlen 8192 \ + --num_q_heads 64 --num_kv_heads 8 --head_dim 128 \ + --sdpa_backend cudnn --data_type bfloat16 \ + --attn_mask top_left --fwd_bwd + +# cuDNN Frontend (FP8) +python benchmark_single_sdpa.py \ + --batch_size 1 --q_seqlen 8192 --kv_seqlen 8192 \ + --num_q_heads 64 --num_kv_heads 8 --head_dim 128 \ + --sdpa_backend cudnn --data_type fp8 \ + --attn_mask top_left --fwd_bwd + +# FlashAttention 4 
+python benchmark_single_sdpa.py \ + --batch_size 1 --q_seqlen 8192 --kv_seqlen 8192 \ + --num_q_heads 64 --num_kv_heads 8 --head_dim 128 \ + --sdpa_backend flash_attention_4 --data_type bfloat16 \ + --attn_mask top_left --fwd_bwd +``` + +Run `python benchmark_single_sdpa.py --help` for all options. + +## Programmatic Usage + +```python +from benchmark.sdpa_benchmark_training import ( + BenchmarkRunner, + BenchmarkConfig, + ModelPreset, + load_config, +) + +# Load existing config +config = load_config("llama") + +# Or create programmatically +config = BenchmarkConfig( + name="custom", + models=[ModelPreset("test", 64, 8, 128)], + seqlens=[(4096, 4096)], + backends=["cudnn"], +) + +runner = BenchmarkRunner() +results = runner.run_config(config) +runner.save_csv(results, config) +``` + +## Supported Backends + +| Backend | Description | +|---------|-------------| +| `cudnn` | cuDNN (native, via cuDNN Frontend) | +| `flash_attention_4` | FlashAttention 4 | +| `flash_attention_3` | FlashAttention 3 | +| `pyt_flash_attention` | PyTorch FlashAttention | +| `pyt_cudnn` | PyTorch cuDNN backend | +| `pyt_efficient_attention` | PyTorch xFormers | + +## Benchmark Results + +### GB200 - Llama 3.1 Causal (top_left) +![Llama 3.1 Causal on GB200](results/gb200_918_only_cudnn/llama3.1_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB200 GPU + +### GB200 - Llama 3.1 Non-Causal (no_mask) +![Llama 3.1 Non-Causal on GB200](results/gb200_918_only_cudnn/llama3.1_no_mask.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=False` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB200 GPU + +### GB200 - DeepSeek V3 Causal (top_left) +![DeepSeek V3 Causal on GB200](results/gb200_918_only_cudnn/dsv3_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=128; num_kv_heads=128; head_dim_qk=192; 
head_dim_vo=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB200 GPU + +### GB300 - Llama 3.1 Causal (top_left) +![Llama 3.1 Causal on GB300](results/gb300_918_only_cudnn/llama3.1_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB300 GPU + +### GB300 - Llama 3.1 Non-Causal (no_mask) +![Llama 3.1 Non-Causal on GB300](results/gb300_918_only_cudnn/llama3.1_no_mask.png) +- SDPA parameters: `batch=1; num_q_heads=64; num_kv_heads=8; head_dim=128; is_causal=False` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB300 GPU + +### GB300 - DeepSeek V3 Causal (top_left) +![DeepSeek V3 Causal on GB300](results/gb300_918_only_cudnn/dsv3_top_left_causal.png) +- SDPA parameters: `batch=1; num_q_heads=128; num_kv_heads=128; head_dim_qk=192; head_dim_vo=128; is_causal=True` +- Sequence lengths shown on x-axis +- Results obtained on NVIDIA GB300 GPU + diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/__init__.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/__init__.py new file mode 100644 index 00000000..dd665e2d --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/__init__.py @@ -0,0 +1,40 @@ +""" +SDPA Benchmark Training Package + +This package provides a flexible benchmark configuration system for +Scaled Dot Product Attention (SDPA) operations. 
+ +Usage: + # Run benchmarks from command line + python -m benchmark.sdpa_benchmark_training.runner --config mlperf + + # Dry run to see what would be executed + python -m benchmark.sdpa_benchmark_training.runner --config mlperf --dry-run + + # Import and use programmatically + from benchmark.sdpa_benchmark_training import ( + BenchmarkRunner, + BenchmarkConfig, + BenchmarkResult, + ModelPreset, + load_config, + ) + + config = load_config("mlperf") + runner = BenchmarkRunner() + results = runner.run_config(config) + runner.save_csv(results, config) +""" + +from .config_types import ModelPreset, BenchmarkConfig, BenchmarkResult +from .configs import load_config, list_configs +from .runner import BenchmarkRunner + +__all__ = [ + "ModelPreset", + "BenchmarkConfig", + "BenchmarkResult", + "BenchmarkRunner", + "load_config", + "list_configs", +] diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py new file mode 100644 index 00000000..b9d83810 --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py @@ -0,0 +1,1366 @@ +""" +Scaled Dot Product Attention (SDPA) benchmark + +This script benchmarks a single SDPA compute instance. +The SDPA backend can be chosen. Performance is measured using torch profiler. + +Can be used as CLI or imported as a module: + + # CLI usage + python benchmark_single_sdpa.py --batch_size 1 --q_seqlen 8192 ... + + # Module usage + from benchmark_single_sdpa import run_benchmark + result = run_benchmark(batch_size=1, q_seqlen=8192, ...) 
+""" + +import argparse +import torch +from torch.nn.attention import SDPBackend, sdpa_kernel +from torch.nn.attention.bias import causal_lower_right +import os +import numpy as np +import functools +import time +import math +from typing import Optional, Dict, Any + +from torch.profiler import profile, record_function, ProfilerActivity + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--batch_size", default=1, type=int, help="Batch size to input to the layer") + parser.add_argument("--q_seqlen", default=8192, type=int, help="Sequence length to input to the layer") + parser.add_argument("--kv_seqlen", default=8192, type=int, help="Sequence length to input to the layer") + parser.add_argument( + "--num_q_heads", + default=16, + type=int, + help="Number of query heads to input to the layer", + ) + parser.add_argument( + "--num_kv_heads", + default=8, + type=int, + help="Number of key/value heads to input to the layer", + ) + parser.add_argument("--head_dim", default=128, type=int, help="Head dimension to input to the layer") + parser.add_argument( + "--head_dim_qk", + default=None, + type=int, + help="Optional: head dimension for Q/K. If set, must also set --head_dim_vo", + ) + parser.add_argument( + "--head_dim_vo", + default=None, + type=int, + help="Optional: head dimension for V/O. If set, must also set --head_dim_qk", + ) + parser.add_argument( + "--data_type", + default="bfloat16", + type=str, + help="Data type to input to the layer. 
Can be bfloat16, float16, or fp8", + ) + parser.add_argument( + "--num_iterations", + default=20, + type=int, + help="Number of iterations to run the layer for performance measurement", + ) + parser.add_argument( + "--num_warmup_iterations", + default=0, + type=int, + help="Number of warmup iterations to run before measuring performance", + ) + parser.add_argument("--verbose", action="store_true", help="Verbose output") + parser.add_argument( + "--fwd_bwd", + action="store_true", + help="Run both forward and backward pass (fwd only by default)", + ) + parser.add_argument( + "--profile_pass", + default=None, + type=str, + choices=["fwd", "bwd", "both"], + help="Which pass to profile (default: fwd unless --fwd_bwd is set).", + ) + parser.add_argument( + "--deterministic_bwd", + action="store_true", + help="Use deterministic algorithm for backward pass where supported (cudnn FP16/BF16/FP8)", + ) + parser.add_argument( + "--attn_mask", + default="no_mask", + type=str, + help="Attn mask to use. Can be 'top_left', 'bottom_right', or 'no_mask'.", + choices=["top_left", "bottom_right", "no_mask"], + ) + parser.add_argument( + "--sdpa_backend", + default="pyt_cudnn", + type=str, + help="SDPA backend to use", + choices=[ + "pyt_math", + "pyt_cudnn", + "pyt_efficient_attention", + "pyt_flash_attention", + "flash_attention", + "flash_attention_3", + "flash_attention_4", + "cudnn", + ], + ) + parser.add_argument("--format_output", action="store_true", help="Format output to be used in benchmark") + parser.add_argument( + "--case_tag", + default="", + type=str, + help="Tag to identify the case. Not used in calculations. 
Only for formatted output", + ) + parser.add_argument( + "--skip_ref", + action="store_true", + help="Skip reference SDPA implementation", + ) + return parser.parse_args() + + +def run_benchmark( + batch_size: int, + q_seqlen: int, + kv_seqlen: int, + num_q_heads: int, + num_kv_heads: int, + head_dim: int = 128, + head_dim_qk: Optional[int] = None, + head_dim_vo: Optional[int] = None, + data_type: str = "bfloat16", + backend: str = "cudnn", + attn_mask: str = "no_mask", + profile_pass: str = "fwd", + num_iterations: int = 10, + num_warmup_iterations: int = 0, + skip_ref: bool = True, + deterministic_bwd: bool = False, + verbose: bool = False, +) -> Dict[str, Any]: + """ + Run a single SDPA benchmark. + + This function can be called directly when using the module as a library. + Internally uses subprocess to call this script with the appropriate arguments. + + Args: + batch_size: Batch size + q_seqlen: Query sequence length + kv_seqlen: Key/value sequence length + num_q_heads: Number of query heads + num_kv_heads: Number of key/value heads + head_dim: Head dimension (used if head_dim_qk/vo not specified) + head_dim_qk: Head dimension for Q/K (optional, for asymmetric) + head_dim_vo: Head dimension for V/O (optional, for asymmetric) + data_type: Data type ("bfloat16", "float16", "fp8") + backend: Backend name ("cudnn", "flash_attention_4", etc.) 
+ attn_mask: Attention mask ("no_mask", "top_left", "bottom_right") + profile_pass: Which pass to profile ("fwd", "bwd", "both") + num_iterations: Number of benchmark iterations + num_warmup_iterations: Warmup iterations before measurement + skip_ref: Skip reference validation + deterministic_bwd: Use deterministic backward algorithm + verbose: Print verbose output + + Returns: + Dict with keys: + - fwd_time_ms: Median forward time in milliseconds + - bwd_time_ms: Median backward time in milliseconds (0 if not run) + - fwd_tflops: Forward TFLOPS + - bwd_tflops: Backward TFLOPS + - max_diff: Maximum difference vs reference + - gpu_name: GPU name string + - cudnn_version: cuDNN version (if available) + + Raises: + RuntimeError: If the benchmark subprocess fails + """ + import subprocess + import sys + + # Build command + script_path = os.path.abspath(__file__) + cmd = [ + sys.executable, + script_path, + "--batch_size", + str(batch_size), + "--q_seqlen", + str(q_seqlen), + "--kv_seqlen", + str(kv_seqlen), + "--num_q_heads", + str(num_q_heads), + "--num_kv_heads", + str(num_kv_heads), + "--data_type", + data_type, + "--sdpa_backend", + backend, + "--attn_mask", + attn_mask, + "--num_iterations", + str(num_iterations), + "--num_warmup_iterations", + str(num_warmup_iterations), + "--format_output", # Get CSV-formatted output for parsing + ] + + # Handle head dimensions + if head_dim_qk is not None and head_dim_vo is not None: + cmd.extend(["--head_dim_qk", str(head_dim_qk)]) + cmd.extend(["--head_dim_vo", str(head_dim_vo)]) + else: + cmd.extend(["--head_dim", str(head_dim)]) + + # Handle profile pass + if profile_pass == "both": + cmd.append("--fwd_bwd") + elif profile_pass in ("fwd", "bwd"): + cmd.extend(["--profile_pass", profile_pass]) + + # Handle flags + if skip_ref: + cmd.append("--skip_ref") + if deterministic_bwd: + cmd.append("--deterministic_bwd") + if verbose: + cmd.append("--verbose") + + # Run benchmark + result = subprocess.run( + cmd, + 
capture_output=True, + text=True, + check=False, + ) + + if result.returncode != 0: + raise RuntimeError(f"Benchmark failed with return code {result.returncode}.\n" f"stderr: {result.stderr}\n" f"stdout: {result.stdout}") + + # Parse CSV output + # Format: case_tag,backend,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim,fwd_time,bwd_time,fwd_tflops,bwd_tflops,max_diff,num_iters + output_line = result.stdout.strip().split("\n")[-1] + parts = output_line.split(",") + + if len(parts) < 12: + raise RuntimeError(f"Unexpected output format: {output_line}") + + # Get GPU name from torch + gpu_name = torch.cuda.get_device_name(torch.cuda.current_device()) if torch.cuda.is_available() else "Unknown" + + # Try to get cudnn version + cudnn_version = None + cudnn_backend_version = None + try: + import cudnn + + cudnn_version = cudnn.__version__ + cudnn_backend_version = cudnn.backend_version() + except ImportError: + pass + + return { + "fwd_time_ms": float(parts[8]), + "bwd_time_ms": float(parts[9]), + "fwd_tflops": float(parts[10]), + "bwd_tflops": float(parts[11]), + "max_diff": float(parts[12]) if len(parts) > 12 else 0.0, + "gpu_name": gpu_name, + "cudnn_version": cudnn_version, + "cudnn_backend_version": cudnn_backend_version, + } + + +# ============================================================================ +# Main benchmark implementation (runs when script is executed directly) +# ============================================================================ + +# Note: All code below this point is only executed when running as a script. +# When imported as a module, use the run_benchmark() function above. 
+ +if __name__ != "__main__": + # Stop here when imported as module + pass +else: + # Parse command line arguments + args = parse_args() + + if args.data_type == "bfloat16": + target_dtype = torch.bfloat16 + elif args.data_type == "float16": + target_dtype = torch.float16 + elif args.data_type == "float": + target_dtype = torch.float + elif args.data_type == "fp8": + target_dtype = None + else: + raise ValueError(f"Invalid data type: {args.data_type}") + + if args.data_type == "fp8": + if args.sdpa_backend not in ["cudnn", "flash_attention_3"]: + raise ValueError(f"FP8 is only supported for cudnn and flash_attention_3 backends") + + # Parse input arguments + num_iters = args.num_iterations + dry_run_iters = args.num_warmup_iterations + batch_size = args.batch_size + q_seqlen = args.q_seqlen + kv_seqlen = args.kv_seqlen + num_q_heads = args.num_q_heads + num_kv_heads = args.num_kv_heads + if args.head_dim_qk is None and args.head_dim_vo is None: + head_dim_qk = args.head_dim + head_dim_vo = args.head_dim + elif args.head_dim_qk is not None and args.head_dim_vo is not None: + head_dim_qk = args.head_dim_qk + head_dim_vo = args.head_dim_vo + else: + raise ValueError("Both --head_dim_qk and --head_dim_vo must be provided together when using asymmetric head dims.") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + assert device.type == "cuda", "Requires CUDA device" + if args.profile_pass is not None: + run_fwd = args.profile_pass in ("fwd", "both") + run_bwd = args.profile_pass in ("bwd", "both") + elif args.fwd_bwd: + run_fwd = True + run_bwd = True + else: + run_fwd = True + run_bwd = False + enable_gqa = num_q_heads != num_kv_heads + assert args.attn_mask != "bottom_right" or q_seqlen <= kv_seqlen, "Bottom right causal mask not supported when q_seqlen > kv_seqlen" + # if args.sdpa_backend in ["flash_attention", "flash_attention_3", "pyt_flash_attention"]: + # assert args.attn_mask != "top_left", "Flash Attention does not support top left 
causal mask" + + l2_flush_size_mb = 256 + l2_flush_size = l2_flush_size_mb * 1024 * 1024 + l2_flush_buffer = torch.empty(l2_flush_size, device=device, dtype=torch.int8) + + ############################################################# + ########### Set up SDPA function for each backend ########### + + ## If using cuDNN FE, set up cuDNN graph. + if args.sdpa_backend == "cudnn": + is_dropout = False # Hard coded + dropout_prob = dropout_p if is_dropout else 0.0 # Hard coded to 0 + is_infer = False # Hard coded + attn_scale = head_dim_qk ** (-0.5) + + try: + import cudnn + except ImportError: + cudnn = None + assert cudnn is not None + + if args.verbose: + print(f"[INFO] cuDNN Backend Version: {cudnn.backend_version() = }") + print(f"[INFO] cuDNN Frontend Version: {cudnn.__version__ = }") + + # Helper function: Convert torch type to cuDNN type + def convert_to_cudnn_type(torch_type): + if torch_type == torch.float16: + return cudnn.data_type.HALF + elif torch_type == torch.bfloat16: + return cudnn.data_type.BFLOAT16 + elif torch_type == torch.float32: + return cudnn.data_type.FLOAT + elif torch_type == torch.int32: + return cudnn.data_type.INT32 + elif torch_type == torch.int64: + return cudnn.data_type.INT64 + else: + raise ValueError("Unsupported tensor data type.") + + ## Will define tensors to set up cuDNN graph once. 
+ if args.data_type == "fp8": + query = torch.randint( + 256, + (batch_size, q_seqlen, num_q_heads, head_dim_qk), + dtype=torch.uint8, + device=device, + ).transpose(1, 2) + key = torch.randint( + 256, + (batch_size, kv_seqlen, num_kv_heads, head_dim_qk), + dtype=torch.uint8, + device=device, + ).transpose(1, 2) + value = torch.randint( + 256, + (batch_size, kv_seqlen, num_kv_heads, head_dim_vo), + dtype=torch.uint8, + device=device, + ).transpose(1, 2) + output = torch.empty( + batch_size, + q_seqlen, + num_q_heads, + head_dim_vo, + dtype=torch.uint8, + device=device, + ).transpose(1, 2) + + descale_q_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_k_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_v_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_s_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_o_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_dO_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_dP_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_s_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_o_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dQ_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dK_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dV_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dP_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + amax_s_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_o_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dQ_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dK_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dV_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dP_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, 
device=device) + else: + query = torch.randn( + batch_size, + q_seqlen, + num_q_heads, + head_dim_qk, + dtype=target_dtype, + device=device, + ).transpose(1, 2) + key = torch.randn( + batch_size, + kv_seqlen, + num_kv_heads, + head_dim_qk, + dtype=target_dtype, + device=device, + ).transpose(1, 2) + value = torch.randn( + batch_size, + kv_seqlen, + num_kv_heads, + head_dim_vo, + dtype=target_dtype, + device=device, + ).transpose(1, 2) + output = torch.empty( + batch_size, + q_seqlen, + num_q_heads, + head_dim_vo, + dtype=target_dtype, + device=device, + ).transpose(1, 2) + + dQuery = torch.empty_like(query) + dKey = torch.empty_like(key) + dValue = torch.empty_like(value) + if args.data_type == "fp8": + # Create as bfloat16, convert to FP8, then view as uint8 to avoid DLPack issues + dOutput_bf16 = torch.randn(output.shape, dtype=torch.bfloat16, device=device) + dOutput_fp8 = dOutput_bf16.to(torch.float8_e4m3fn) + dOutput = dOutput_fp8.view(torch.uint8) + else: + dOutput = torch.randn_like(output) + stats = torch.randn(batch_size, q_seqlen, num_q_heads, 1, dtype=torch.float32, device=device).transpose(1, 2) + if is_dropout: + dropout_seed = torch.full((1, 1, 1, 1), 123456, dtype=torch.int64, device="cuda") + dropout_offset = torch.full((1, 1, 1, 1), 789, dtype=torch.int64, device="cuda") + + # cuDNN graph forward + graph_fwd = cudnn.pygraph( + io_data_type=(cudnn.data_type.FP8_E4M3 if args.data_type == "fp8" else convert_to_cudnn_type(target_dtype)), + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + + if is_dropout: + seed_fwd = graph_fwd.tensor_like(dropout_seed) + offset_fwd = graph_fwd.tensor_like(dropout_offset) + dropout_tuple = (dropout_prob, seed_fwd, offset_fwd) + + if args.data_type == "fp8": + q_fwd = graph_fwd.tensor_like(query).set_data_type(cudnn.data_type.FP8_E4M3) + k_fwd = graph_fwd.tensor_like(key).set_data_type(cudnn.data_type.FP8_E4M3) + v_fwd = 
graph_fwd.tensor_like(value).set_data_type(cudnn.data_type.FP8_E4M3) + + descale_q_fwd = graph_fwd.tensor_like(descale_q_gpu) + descale_k_fwd = graph_fwd.tensor_like(descale_k_gpu) + descale_v_fwd = graph_fwd.tensor_like(descale_v_gpu) + descale_s_fwd = graph_fwd.tensor_like(descale_s_gpu) + scale_s_fwd = graph_fwd.tensor_like(scale_s_gpu) + scale_o_fwd = graph_fwd.tensor_like(scale_o_gpu) + + o_fwd, stats_fwd, amax_s_fwd, amax_o_fwd = graph_fwd.sdpa_fp8( + q=q_fwd, + k=k_fwd, + v=v_fwd, + descale_q=descale_q_fwd, + descale_k=descale_k_fwd, + descale_v=descale_v_fwd, + descale_s=descale_s_fwd, + scale_s=scale_s_fwd, + scale_o=scale_o_fwd, + # generate_stats=not is_infer, + is_inference=is_infer, + attn_scale=attn_scale, + diagonal_alignment=(cudnn.diagonal_alignment.BOTTOM_RIGHT if args.attn_mask == "bottom_right" else cudnn.diagonal_alignment.TOP_LEFT), + right_bound=None if args.attn_mask == "no_mask" else 0, + # dropout=dropout_tuple if is_dropout else None, + ) + else: + q_fwd = graph_fwd.tensor_like(query) + k_fwd = graph_fwd.tensor_like(key) + v_fwd = graph_fwd.tensor_like(value) + o_fwd, stats_fwd = graph_fwd.sdpa( + q=q_fwd, + k=k_fwd, + v=v_fwd, + # generate_stats=not is_infer, + is_inference=is_infer, + attn_scale=attn_scale, + diagonal_alignment=(cudnn.diagonal_alignment.BOTTOM_RIGHT if args.attn_mask == "bottom_right" else cudnn.diagonal_alignment.TOP_LEFT), + diagonal_band_right_bound=None if args.attn_mask == "no_mask" else 0, + dropout=dropout_tuple if is_dropout else None, + ) + + if run_bwd: + if args.data_type == "fp8": + o_fwd.set_output(True).set_dim(output.size()).set_stride(output.stride()).set_data_type(cudnn.data_type.FP8_E4M3) + (stats_fwd.set_output(True).set_dim(stats.size()).set_stride(stats.stride()).set_data_type(cudnn.data_type.FLOAT) if not is_infer else None) + else: + o_fwd.set_output(True).set_dim(output.size()).set_stride(output.stride()) + 
(stats_fwd.set_output(True).set_dim(stats.size()).set_stride(stats.stride()).set_data_type(cudnn.data_type.FLOAT) if not is_infer else None) + else: + if args.data_type == "fp8": + o_fwd.set_output(True).set_dim(output.size()).set_stride(output.stride()).set_data_type(cudnn.data_type.FP8_E4M3) + else: + o_fwd.set_output(True).set_dim(output.size()).set_stride(output.stride()) + + if args.data_type == "fp8": + amax_s_fwd.set_output(True).set_dim(amax_s_gpu.size()).set_stride(amax_s_gpu.stride()).set_data_type(cudnn.data_type.FLOAT) + amax_o_fwd.set_output(True).set_dim(amax_o_gpu.size()).set_stride(amax_o_gpu.stride()).set_data_type(cudnn.data_type.FLOAT) + graph_fwd.validate() + graph_fwd.build_operation_graph() + graph_fwd.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph_fwd.check_support() + graph_fwd.build_plans() + + # If backward is requested, set up backward graph. + if run_bwd: + graph_bwd = cudnn.pygraph( + io_data_type=(cudnn.data_type.FP8_E4M3 if args.data_type == "fp8" else convert_to_cudnn_type(target_dtype)), + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + + stats_bwd = graph_bwd.tensor_like(stats) + if is_dropout: + seed_bwd = graph_bwd.tensor_like(dropout_seed) + offset_bwd = graph_bwd.tensor_like(dropout_offset) + dropout_tuple = (dropout_prob, seed_bwd, offset_bwd) + + if args.data_type == "fp8": + q_bwd = graph_bwd.tensor_like(query).set_data_type(cudnn.data_type.FP8_E4M3) + k_bwd = graph_bwd.tensor_like(key).set_data_type(cudnn.data_type.FP8_E4M3) + v_bwd = graph_bwd.tensor_like(value).set_data_type(cudnn.data_type.FP8_E4M3) + o_bwd = graph_bwd.tensor_like(output).set_data_type(cudnn.data_type.FP8_E4M3) + dO_bwd = graph_bwd.tensor_like(dOutput).set_data_type(cudnn.data_type.FP8_E4M3) + + descale_q_bwd = graph_bwd.tensor_like(descale_q_gpu) + descale_k_bwd = graph_bwd.tensor_like(descale_k_gpu) + descale_v_bwd = graph_bwd.tensor_like(descale_v_gpu) + descale_o_bwd = 
graph_bwd.tensor_like(descale_o_gpu) + descale_dO_bwd = graph_bwd.tensor_like(descale_dO_gpu) + descale_s_bwd = graph_bwd.tensor_like(descale_s_gpu) + descale_dP_bwd = graph_bwd.tensor_like(descale_dP_gpu) + scale_s_bwd = graph_bwd.tensor_like(scale_s_gpu) + scale_dQ_bwd = graph_bwd.tensor_like(scale_dQ_gpu) + scale_dK_bwd = graph_bwd.tensor_like(scale_dK_gpu) + scale_dV_bwd = graph_bwd.tensor_like(scale_dV_gpu) + scale_dP_bwd = graph_bwd.tensor_like(scale_dP_gpu) + + ( + dQ_bwd, + dK_bwd, + dV_bwd, + amax_dQ_bwd, + amax_dK_bwd, + amax_dV_bwd, + amax_dP_bwd, + ) = graph_bwd.sdpa_fp8_backward( + q=q_bwd, + k=k_bwd, + v=v_bwd, + o=o_bwd, + dO=dO_bwd, + stats=stats_bwd, + descale_q=descale_q_bwd, + descale_k=descale_k_bwd, + descale_v=descale_v_bwd, + descale_o=descale_o_bwd, + descale_dO=descale_dO_bwd, + descale_s=descale_s_bwd, + descale_dP=descale_dP_bwd, + scale_s=scale_s_bwd, + scale_dQ=scale_dQ_bwd, + scale_dK=scale_dK_bwd, + scale_dV=scale_dV_bwd, + scale_dP=scale_dP_bwd, + attn_scale=attn_scale, + use_causal_mask=args.attn_mask != "no_mask" and args.attn_mask != "bottom_right", + use_causal_mask_bottom_right=args.attn_mask == "bottom_right", + dropout=dropout_tuple if is_dropout else None, + use_deterministic_algorithm=args.deterministic_bwd, + ) + else: + q_bwd = graph_bwd.tensor_like(query) + k_bwd = graph_bwd.tensor_like(key) + v_bwd = graph_bwd.tensor_like(value) + o_bwd = graph_bwd.tensor_like(output) + dO_bwd = graph_bwd.tensor_like(dOutput) + + dQ_bwd, dK_bwd, dV_bwd = graph_bwd.sdpa_backward( + q=q_bwd, + k=k_bwd, + v=v_bwd, + o=o_bwd, + dO=dO_bwd, + stats=stats_bwd, + attn_scale=attn_scale, + diagonal_alignment=(cudnn.diagonal_alignment.BOTTOM_RIGHT if args.attn_mask == "bottom_right" else cudnn.diagonal_alignment.TOP_LEFT), + diagonal_band_right_bound=None if args.attn_mask == "no_mask" else 0, + dropout=dropout_tuple if is_dropout else None, + use_deterministic_algorithm=args.deterministic_bwd, + ) + + if args.data_type == "fp8": + 
dQ_bwd.set_output(True).set_dim(dQuery.size()).set_stride(dQuery.stride()).set_data_type(cudnn.data_type.FP8_E4M3) + dK_bwd.set_output(True).set_dim(dKey.size()).set_stride(dKey.stride()).set_data_type(cudnn.data_type.FP8_E4M3) + dV_bwd.set_output(True).set_dim(dValue.size()).set_stride(dValue.stride()).set_data_type(cudnn.data_type.FP8_E4M3) + amax_dQ_bwd.set_output(True).set_dim(amax_dQ_gpu.size()).set_stride(amax_dQ_gpu.stride()).set_data_type(cudnn.data_type.FLOAT) + amax_dK_bwd.set_output(True).set_dim(amax_dK_gpu.size()).set_stride(amax_dK_gpu.stride()).set_data_type(cudnn.data_type.FLOAT) + amax_dV_bwd.set_output(True).set_dim(amax_dV_gpu.size()).set_stride(amax_dV_gpu.stride()).set_data_type(cudnn.data_type.FLOAT) + amax_dP_bwd.set_output(True).set_dim(amax_dP_gpu.size()).set_stride(amax_dP_gpu.stride()).set_data_type(cudnn.data_type.FLOAT) + else: + dQ_bwd.set_output(True).set_dim(dQuery.size()).set_stride(dQuery.stride()) + dK_bwd.set_output(True).set_dim(dKey.size()).set_stride(dKey.stride()) + dV_bwd.set_output(True).set_dim(dValue.size()).set_stride(dValue.stride()) + + graph_bwd.validate() + graph_bwd.build_operation_graph() + graph_bwd.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph_bwd.check_support() + graph_bwd.build_plans() + + if args.data_type == "fp8": + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + stats_fwd: stats, + descale_q_fwd: descale_q_gpu, + descale_k_fwd: descale_k_gpu, + descale_v_fwd: descale_v_gpu, + descale_s_fwd: descale_s_gpu, + scale_s_fwd: scale_s_gpu, + scale_o_fwd: scale_o_gpu, + amax_s_fwd: amax_s_gpu, + amax_o_fwd: amax_o_gpu, + } + + variant_pack_bwd = { + q_bwd: query, + k_bwd: key, + v_bwd: value, + o_bwd: output, + dQ_bwd: dQuery, + dK_bwd: dKey, + dV_bwd: dValue, + dO_bwd: dOutput, + stats_bwd: stats, + descale_q_bwd: descale_q_gpu, + descale_k_bwd: descale_k_gpu, + descale_v_bwd: descale_v_gpu, + descale_o_bwd: descale_o_gpu, + descale_s_bwd: 
descale_s_gpu, + descale_dP_bwd: descale_dP_gpu, + descale_dO_bwd: descale_dO_gpu, + scale_s_bwd: scale_s_gpu, + scale_dQ_bwd: scale_dQ_gpu, + scale_dK_bwd: scale_dK_gpu, + scale_dV_bwd: scale_dV_gpu, + scale_dP_bwd: scale_dP_gpu, + amax_dQ_bwd: amax_dQ_gpu, + amax_dK_bwd: amax_dK_gpu, + amax_dV_bwd: amax_dV_gpu, + amax_dP_bwd: amax_dP_gpu, + } + + workspace = torch.empty( + max(graph_fwd.get_workspace_size(), graph_bwd.get_workspace_size()), + device="cuda", + dtype=torch.uint8, + ) + else: + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + stats_fwd: stats, + } + variant_pack_bwd = { + q_bwd: query, + k_bwd: key, + v_bwd: value, + o_bwd: output, + dO_bwd: dOutput, + stats_bwd: stats, + dQ_bwd: dQuery, + dK_bwd: dKey, + dV_bwd: dValue, + } + workspace = torch.empty( + max(graph_fwd.get_workspace_size(), graph_bwd.get_workspace_size()), + device="cuda", + dtype=torch.uint8, + ) + else: + if args.data_type == "fp8": + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + stats_fwd: stats, + descale_q_fwd: descale_q_gpu, + descale_k_fwd: descale_k_gpu, + descale_v_fwd: descale_v_gpu, + descale_s_fwd: descale_s_gpu, + scale_s_fwd: scale_s_gpu, + scale_o_fwd: scale_o_gpu, + amax_s_fwd: amax_s_gpu, + amax_o_fwd: amax_o_gpu, + } + workspace = torch.empty(graph_fwd.get_workspace_size(), device="cuda", dtype=torch.uint8) + else: + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + } + workspace = torch.empty(graph_fwd.get_workspace_size(), device="cuda", dtype=torch.uint8) + if is_dropout: + variant_pack_fwd[seed_fwd] = dropout_seed + variant_pack_fwd[offset_fwd] = dropout_offset + if run_bwd: + variant_pack_bwd[seed_bwd] = dropout_seed + variant_pack_bwd[offset_bwd] = dropout_offset + ## Done setting up cuDNN graph. 
+ + # For backends MATH, EFFICIENT_ATTENTION, CUDNN_ATTENTION, PYTORCH_FLASH_ATTENTION + def pyt_backend_sdpa(query, key, value, backend): + with sdpa_kernel(backends=[backend]): + return torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + enable_gqa=enable_gqa, + is_causal=args.attn_mask == "top_left", + attn_mask=causal_lower_right(q_seqlen, kv_seqlen) if args.attn_mask == "bottom_right" else None, + ) + + if args.sdpa_backend == "flash_attention": + import flash_attn + from flash_attn import flash_attn_func + + # Flash Attention Native + def flash_attention_sdpa(query, key, value): + return flash_attn_func(query, key, value, causal=args.attn_mask != "no_mask") + + if args.sdpa_backend == "flash_attention_3": + import flash_attn_interface + + def flash_attention_3_sdpa(query, key, value): + output, _ = flash_attn_interface.flash_attn_func(query, key, value, causal=args.attn_mask != "no_mask") + return output + + if args.sdpa_backend == "flash_attention_4" or (not args.skip_ref): + import flash_attn.cute.interface as flash_attn_interface + + def flash_attention_4_sdpa(query, key, value): + output, _ = flash_attn_interface.flash_attn_func(query, key, value, causal=args.attn_mask != "no_mask") + return output + + def get_sdpa_function(backend): + if backend == "pyt_math": + return functools.partial(pyt_backend_sdpa, backend=SDPBackend.MATH) + elif backend == "pyt_efficient_attention": + return functools.partial(pyt_backend_sdpa, backend=SDPBackend.EFFICIENT_ATTENTION) + elif backend == "pyt_flash_attention": + return functools.partial(pyt_backend_sdpa, backend=SDPBackend.FLASH_ATTENTION) + elif backend == "pyt_cudnn": + return functools.partial(pyt_backend_sdpa, backend=SDPBackend.CUDNN_ATTENTION) + elif backend == "flash_attention": + return flash_attention_sdpa + elif backend == "flash_attention_3": + return flash_attention_3_sdpa + elif backend == "flash_attention_4": + return flash_attention_4_sdpa + elif backend == "cudnn": + return 
None # Will be set up separately + else: + raise ValueError(f"Invalid backend: {backend}") + + # Util function for addressing different qkv formats for each backend + def preprocess_qkv(query, key, value, backend): + if backend.startswith("pyt_") or backend == "cudnn": + return query, key, value + elif backend.startswith("flash_attention"): + query = torch.swapaxes(query, 1, 2) + key = torch.swapaxes(key, 1, 2) + value = torch.swapaxes(value, 1, 2) + return query, key, value + else: + raise ValueError(f"Invalid backend: {backend}") + + # Util function addressing different qkvo formats for each backend + def postprocess_qkvo(query, key, value, output, backend): + if backend.startswith("pyt_") or backend == "cudnn": + return query, key, value, output + elif backend.startswith("flash_attention"): + output = torch.swapaxes(output, 1, 2) + query = torch.swapaxes(query, 1, 2) + key = torch.swapaxes(key, 1, 2) + value = torch.swapaxes(value, 1, 2) + return query, key, value, output + else: + raise ValueError(f"Invalid backend: {backend}") + + def postprocess_dqdkdvdo(dQuery, dKey, dValue, dOutput, backend): + if backend.startswith("pyt_") or backend == "cudnn": + return dQuery, dKey, dValue, dOutput + elif backend.startswith("flash_attention"): + dQuery = torch.swapaxes(dQuery, 1, 2) + dKey = torch.swapaxes(dKey, 1, 2) + dValue = torch.swapaxes(dValue, 1, 2) + dOutput = torch.swapaxes(dOutput, 1, 2) + return dQuery, dKey, dValue, dOutput + else: + raise ValueError(f"Invalid backend: {backend}") + + # Util functions for calculating flops and tflops/s achieved + def flops( + batch_size, + q_seqlen, + kv_seqlen, + head_dim_qk, + head_dim_vo, + num_q_heads, + attn_mask, + mode="fwd", + ): + assert mode in ["fwd", "bwd", "fwd_bwd"] + + if attn_mask == "no_mask": + num_nonmasked_elems = q_seqlen * kv_seqlen + elif attn_mask == "top_left": + num_nonmasked_elems = torch.tril(torch.ones((q_seqlen, kv_seqlen), dtype=torch.bool)).sum() + elif attn_mask == "bottom_right": + 
diagonal_offset = kv_seqlen - q_seqlen + num_nonmasked_elems = torch.tril( + torch.ones((q_seqlen, kv_seqlen), dtype=torch.bool), + diagonal=diagonal_offset, + ).sum() + # BMM FLOPs: 2 * M * N * K. + # Here, M*N = num_nonmasked_elems per head; add batch_size * num_q_heads multiplier. + # Forward: 2 BMMs => (1 x head_dim_qk) + (1 x head_dim_vo) + # Backward: 5 BMMs => (3 x head_dim_qk) + (2 x head_dim_vo) + base = batch_size * num_q_heads * num_nonmasked_elems * 2 + if mode == "fwd": + result = base * (head_dim_qk + head_dim_vo) + elif mode == "bwd": + result = base * (3 * head_dim_qk + 2 * head_dim_vo) + else: # fwd_bwd + result = base * (4 * head_dim_qk + 3 * head_dim_vo) + return result + + def tflops_per_sec( + batch_size, + q_seqlen, + kv_seqlen, + head_dim_qk, + head_dim_vo, + num_q_heads, + attn_mask, + time, + mode="fwd", + ): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = flops( + batch_size, + q_seqlen, + kv_seqlen, + head_dim_qk, + head_dim_vo, + num_q_heads, + attn_mask, + mode, + ) + return f / time / 1e9 if not math.isnan(time) else 0.0 # Assume time is in msec + + ###### Done setting up SDPA function for each backend ####### + ############################################################# + + ###### SDPA Benchmark -- Run ###### + ## Print System Info + if args.verbose: + print(f"[INFO] {torch.__version__ = }") + print(f"[INFO] {torch.version.cuda = }") + print(f"[INFO] {torch.cuda.is_available() = }") + print(f"[INFO] {torch.cuda.device_count() = }") + print(f"[INFO] {torch.cuda.current_device() = }") + print(f"[INFO] {torch.cuda.get_device_name(torch.cuda.current_device()) = }") + if args.sdpa_backend == "pyt_cudnn": + print(f"[INFO] {torch.backends.cudnn.version() = }") + print(f"[INFO] {torch.backends.cudnn.enabled = }") + elif args.sdpa_backend == "flash_attention": + print(f"[INFO] {flash_attn.__version__ = }") + + forward_times = [] + backward_times = [] + forward_diffs = [] + + total_iters = num_iters + dry_run_iters + + first_error = True # 
For suppressing error message beyond first error + sdpa_function = get_sdpa_function(args.sdpa_backend) + for i in range(total_iters): + if args.data_type == "fp8" and args.sdpa_backend == "cudnn": + query = torch.randint( + 256, + (batch_size, q_seqlen, num_q_heads, head_dim_qk), + dtype=torch.uint8, + device=device, + ).transpose(1, 2) + key = torch.randint( + 256, + (batch_size, kv_seqlen, num_kv_heads, head_dim_qk), + dtype=torch.uint8, + device=device, + ).transpose(1, 2) + value = torch.randint( + 256, + (batch_size, kv_seqlen, num_kv_heads, head_dim_vo), + dtype=torch.uint8, + device=device, + ).transpose(1, 2) + descale_q_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_k_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_v_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_s_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_o_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_dO_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + descale_dP_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_s_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_o_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dQ_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dK_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dV_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + scale_dP_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float, device=device) + amax_s_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_o_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dQ_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dK_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dV_gpu = torch.zeros(1, 1, 1, 1, dtype=torch.float, device=device) + amax_dP_gpu = torch.zeros(1, 1, 1, 
1, dtype=torch.float, device=device) + elif args.data_type == "fp8" and args.sdpa_backend == "flash_attention_3": + query = ( + torch.randn( + batch_size, + q_seqlen, + num_q_heads, + head_dim_qk, + dtype=torch.bfloat16, + device=device, + requires_grad=True, + ) + .to(torch.float8_e4m3fn) + .transpose(1, 2) + ) + key = ( + torch.randn( + batch_size, + kv_seqlen, + num_kv_heads, + head_dim_qk, + dtype=torch.bfloat16, + device=device, + requires_grad=True, + ) + .to(torch.float8_e4m3fn) + .transpose(1, 2) + ) + value = ( + torch.randn( + batch_size, + kv_seqlen, + num_kv_heads, + head_dim_vo, + dtype=torch.bfloat16, + device=device, + requires_grad=True, + ) + .to(torch.float8_e4m3fn) + .transpose(1, 2) + ) + else: + query = torch.randn( + batch_size, + q_seqlen, + num_q_heads, + head_dim_qk, + dtype=target_dtype, + device=device, + requires_grad=True, + ).transpose(1, 2) + key = torch.randn( + batch_size, + kv_seqlen, + num_kv_heads, + head_dim_qk, + dtype=target_dtype, + device=device, + requires_grad=True, + ).transpose(1, 2) + value = torch.randn( + batch_size, + kv_seqlen, + num_kv_heads, + head_dim_vo, + dtype=target_dtype, + device=device, + requires_grad=True, + ).transpose(1, 2) + + query, key, value = preprocess_qkv(query, key, value, args.sdpa_backend) + if args.data_type == "fp8" and args.sdpa_backend == "cudnn": + # Create as bfloat16, convert to FP8, then view as uint8 to avoid DLPack issues + dOutput_bf16 = torch.randn(query.shape, dtype=torch.bfloat16, device=device) + dOutput_fp8 = dOutput_bf16.to(torch.float8_e4m3fn) + dOutput = dOutput_fp8.view(torch.uint8) + else: + dOutput = torch.randn_like(query) + + if args.sdpa_backend == "cudnn": + output = torch.empty( + batch_size, + q_seqlen, + num_q_heads, + head_dim_vo, + dtype=torch.uint8 if args.data_type == "fp8" else target_dtype, + device=device, + ).transpose(1, 2) + dQuery = torch.empty_like(query) + dKey = torch.empty_like(key) + dValue = torch.empty_like(value) + stats = 
torch.randn(batch_size, q_seqlen, num_q_heads, 1, dtype=torch.float32, device=device).transpose(1, 2) + if is_dropout: + dropout_seed = torch.full((1, 1, 1, 1), 123456, dtype=torch.int64, device="cuda") + dropout_offset = torch.full((1, 1, 1, 1), 789, dtype=torch.int64, device="cuda") + + # Only variant pack and workspace need to be updated for each iteration. + if run_bwd: + if args.data_type == "fp8": + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + stats_fwd: stats, + descale_q_fwd: descale_q_gpu, + descale_k_fwd: descale_k_gpu, + descale_v_fwd: descale_v_gpu, + descale_s_fwd: descale_s_gpu, + scale_s_fwd: scale_s_gpu, + scale_o_fwd: scale_o_gpu, + amax_s_fwd: amax_s_gpu, + amax_o_fwd: amax_o_gpu, + } + variant_pack_bwd = { + q_bwd: query, + k_bwd: key, + v_bwd: value, + o_bwd: output, + dQ_bwd: dQuery, + dK_bwd: dKey, + dV_bwd: dValue, + dO_bwd: dOutput, + stats_bwd: stats, + descale_q_bwd: descale_q_gpu, + descale_k_bwd: descale_k_gpu, + descale_v_bwd: descale_v_gpu, + descale_o_bwd: descale_o_gpu, + descale_s_bwd: descale_s_gpu, + descale_dP_bwd: descale_dP_gpu, + descale_dO_bwd: descale_dO_gpu, + scale_s_bwd: scale_s_gpu, + scale_dQ_bwd: scale_dQ_gpu, + scale_dK_bwd: scale_dK_gpu, + scale_dV_bwd: scale_dV_gpu, + scale_dP_bwd: scale_dP_gpu, + amax_dQ_bwd: amax_dQ_gpu, + amax_dK_bwd: amax_dK_gpu, + amax_dV_bwd: amax_dV_gpu, + amax_dP_bwd: amax_dP_gpu, + } + else: + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + stats_fwd: stats, + } + variant_pack_bwd = { + q_bwd: query, + k_bwd: key, + v_bwd: value, + o_bwd: output, + dO_bwd: dOutput, + stats_bwd: stats, + dQ_bwd: dQuery, + dK_bwd: dKey, + dV_bwd: dValue, + } + workspace = torch.empty( + max(graph_fwd.get_workspace_size(), graph_bwd.get_workspace_size()), + device="cuda", + dtype=torch.uint8, + ) + else: + if args.data_type == "fp8": + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + stats_fwd: 
stats, + descale_q_fwd: descale_q_gpu, + descale_k_fwd: descale_k_gpu, + descale_v_fwd: descale_v_gpu, + descale_s_fwd: descale_s_gpu, + scale_s_fwd: scale_s_gpu, + scale_o_fwd: scale_o_gpu, + amax_s_fwd: amax_s_gpu, + amax_o_fwd: amax_o_gpu, + } + else: + variant_pack_fwd = { + q_fwd: query, + k_fwd: key, + v_fwd: value, + o_fwd: output, + } + workspace = torch.empty(graph_fwd.get_workspace_size(), device="cuda", dtype=torch.uint8) + + if is_dropout: + variant_pack_fwd[seed_fwd] = dropout_seed + variant_pack_fwd[offset_fwd] = dropout_offset + if run_bwd: + variant_pack_bwd[seed_bwd] = dropout_seed + variant_pack_bwd[offset_bwd] = dropout_offset + + l2_flush_buffer.zero_() + + # Run kernel with profiler for forward if requested, else run unprofiled to prep for backward + if run_fwd: + with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof: + with record_function("sdpa.forward"): # Custom marker + if args.sdpa_backend == "cudnn": + graph_fwd.execute(variant_pack_fwd, workspace) + else: + output = sdpa_function(query, key, value) + torch.cuda.synchronize() # Ensure all kernels finish + + # Filter profiler results by kernel name prefix + matched_kernels = [ + item + for item in prof.key_averages() + if item.key.startswith("cudnn") + or item.key.startswith("kernel_cutlass") + or "pytorch_flash::" in item.key + or "flash::" in item.key + or "at::native::" in item.key + or "cutlass3x" in item.key + or "(anonymous namespace)::" in item.key + or item.key.startswith("fmha_") + ] + if len(matched_kernels) >= 1: + fwd_time = sum(item.device_time for item in matched_kernels) / 1000 + if i >= dry_run_iters: + forward_times.append(fwd_time) + else: + if args.sdpa_backend == "cudnn": + graph_fwd.execute(variant_pack_fwd, workspace) + else: + output = sdpa_function(query, key, value) + torch.cuda.synchronize() + + # Sleep for some time proportional to fwd_time for stable measurements + sleep_time = np.min([fwd_time / 100, 1.0]) if run_fwd and 
len(matched_kernels) >= 1 else 0.0 + time.sleep(sleep_time) + + if run_bwd: + # Run backward pass + + l2_flush_buffer.zero_() + + with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof: + with record_function("sdpa.backward"): # Custom marker + if args.sdpa_backend == "cudnn": + graph_bwd.execute(variant_pack_bwd, workspace) + else: + query.retain_grad() + key.retain_grad() + value.retain_grad() + output.backward(dOutput) + + dQuery = query.grad + dKey = key.grad + dValue = value.grad + + query.grad = None + key.grad = None + value.grad = None + torch.cuda.synchronize() + + matched_kernels = [ + item + for item in prof.key_averages() + if "cudnn" in item.key + or item.key.startswith("kernel_cutlass") + or "pytorch_flash::" in item.key + or "flash::" in item.key + or "at::native::" in item.key + or "cutlass3x" in item.key + or "(anonymous namespace)::" in item.key + or item.key.startswith("fmha_") + ] + if len(matched_kernels) >= 1: + bwd_time = sum(item.device_time for item in matched_kernels) / 1000 + if i >= dry_run_iters: + backward_times.append(bwd_time) + + sleep_time = np.min([bwd_time / 100, 1.0]) if run_bwd and len(matched_kernels) >= 1 else 0.0 + time.sleep(sleep_time) + + dQuery, dKey, dValue, dOutput = postprocess_dqdkdvdo(dQuery, dKey, dValue, dOutput, args.sdpa_backend) + + ( + query, + key, + value, + output, + ) = postprocess_qkvo(query, key, value, output, args.sdpa_backend) + if args.data_type != "fp8" and not args.skip_ref and run_fwd: + try: + output_ref = flash_attention_4_sdpa(query, key, value) + if run_bwd: + query.retain_grad() + key.retain_grad() + value.retain_grad() + output_ref.backward(dOutput) + + torch.testing.assert_close(dQuery, query.grad, rtol=2e-2, atol=2e-2) + torch.testing.assert_close(dKey, key.grad, rtol=2e-2, atol=2e-2) + torch.testing.assert_close(dValue, value.grad, rtol=2e-2, atol=2e-2) + + torch.testing.assert_close(output, output_ref, rtol=1e-2, atol=1e-2) + 
forward_diffs.append(torch.max(torch.abs(output.detach() - output_ref.detach())).item()) + except Exception as e: + if first_error: + print( + f"[WARN] Failed reference check. Target backend has been run, but output has not been validated. Failure may be due to incorrect output or reference function failure." + ) + print(f"[WARN] See error message: {e}") + first_error = False + forward_diffs.append(0.0) + else: + forward_diffs.append(0.0) + + time.sleep(sleep_time) + + if args.sdpa_backend == "cudnn": + del query, key, value, output, dQuery, dKey, dValue, dOutput, stats + else: + del query, key, value, output + + ## print results + fwd_median_time = ( + np.median(np.array(forward_times[5:])) if len(forward_times) > 5 else (np.median(np.array(forward_times)) if len(forward_times) > 0 else 0.0) + ) + fwd_tflops = 0.0 + if run_fwd and fwd_median_time > 0: + fwd_tflops = tflops_per_sec( + args.batch_size, + args.q_seqlen, + args.kv_seqlen, + head_dim_qk, + head_dim_vo, + args.num_q_heads, + args.attn_mask, + fwd_median_time, + "fwd", + ) + + bwd_median_time = ( + np.median(np.array(backward_times[5:])) if len(backward_times) > 5 else (np.median(np.array(backward_times)) if len(backward_times) > 0 else 0.0) + ) + bwd_tflops = 0.0 + if run_bwd and bwd_median_time > 0: + bwd_tflops = tflops_per_sec( + args.batch_size, + args.q_seqlen, + args.kv_seqlen, + head_dim_qk, + head_dim_vo, + args.num_q_heads, + args.attn_mask, + bwd_median_time, + "bwd", + ) + + if args.format_output: + print( + f"{args.case_tag},{args.sdpa_backend},{args.batch_size},{args.q_seqlen},{args.kv_seqlen},{args.num_q_heads},{args.num_kv_heads},{head_dim_qk},{fwd_median_time:.3f},{bwd_median_time:.3f},{fwd_tflops:.0f},{bwd_tflops:.0f},{(np.max(np.array(forward_diffs[5:])) if len(forward_diffs) > 5 else (np.max(np.array(forward_diffs)) if len(forward_diffs) > 0 else 0.0)):.6f},{num_iters}" + ) + else: + if run_fwd and run_bwd: + print( + f"{args.sdpa_backend}:: Median (fwd, bwd) Execution Times: 
{fwd_median_time:.3f} ms ({fwd_tflops:.0f} TFLOPS), {bwd_median_time:.3f} ms ({bwd_tflops:.0f} TFLOPS)" + ) + elif run_fwd: + print(f"{args.sdpa_backend}:: Median (fwd) Execution Time: {fwd_median_time:.3f} ms ({fwd_tflops:.0f} TFLOPS)") + elif run_bwd: + print(f"{args.sdpa_backend}:: Median (bwd) Execution Time: {bwd_median_time:.3f} ms ({bwd_tflops:.0f} TFLOPS)") diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/charts.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/charts.py new file mode 100644 index 00000000..7c9872d5 --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/charts.py @@ -0,0 +1,441 @@ +""" +Chart generation for SDPA benchmark results. + +Generates comparison bar charts showing backend performance side-by-side. +""" + +from pathlib import Path +from typing import Optional, TYPE_CHECKING +import logging + +if TYPE_CHECKING: + import pandas as pd + from .config_types import BenchmarkConfig + +logger = logging.getLogger(__name__) + +# Backend display configuration +# Each backend has a base color; FP8 variants get a darker/different shade +BACKEND_CONFIG = { + "cudnn": {"name": "cudnn", "color": "#76b900", "color_fp8": "#4a7500", "order": 0}, + "pyt_cudnn": {"name": "cuDNN (PyTorch)", "color": "#90EE90", "color_fp8": "#228B22", "order": 1}, + "pyt_flash_attention": {"name": "FAv2 (PyTorch)", "color": "#6495ED", "color_fp8": "#0000CD", "order": 2}, + "pyt_efficient_attention": {"name": "xFormers (PyTorch)", "color": "#FF00FF", "color_fp8": "#8B008B", "order": 3}, + "pyt_math": {"name": "Standard Attention", "color": "#FF8C00", "color_fp8": "#D2691E", "order": 4}, + "flash_attention": {"name": "FAv2 (Native)", "color": "#F08080", "color_fp8": "#CD5C5C", "order": 5}, + "flash_attention_3": {"name": "FAv3", "color": "#FFA500", "color_fp8": "#FF6600", "order": 6}, + "flash_attention_4": {"name": "FAv4", "color": "#FFD700", "color_fp8": "#DAA520", "order": 7}, +} + +# Font sizes for plot 
elements +LABEL_FONT_SIZE = 10 +LEGEND_FONT_SIZE = 8 +TITLE_FONT_SIZE = 12 +BAR_LABEL_FONT_SIZE = 6 + + +def get_backend_display_name(backend: str, data_type: str) -> str: + """ + Get display name for backend+dtype combination. + + Args: + backend: Backend name (e.g., "cudnn") + data_type: Data type (e.g., "bfloat16", "fp8") + + Returns: + Display name for legend (e.g., "cuDNN FE (FP8)") + """ + base_name = BACKEND_CONFIG.get(backend, {}).get("name", backend) + if data_type == "fp8": + return f"{base_name} (FP8)" + elif data_type == "float16": + return f"{base_name} (FP16)" + return base_name + + +def get_backend_color(backend: str, data_type: str) -> str: + """ + Get color for backend+dtype combination. + + Args: + backend: Backend name + data_type: Data type + + Returns: + Color string for matplotlib + """ + config = BACKEND_CONFIG.get(backend, {}) + if data_type == "fp8" and "color_fp8" in config: + return config["color_fp8"] + return config.get("color", "gray") + + +def generate_comparison_chart( + df: "pd.DataFrame", + config: "BenchmarkConfig", + output_path: Optional[Path] = None, +) -> Path: + """ + Generate comparison bar chart with multiple backends side-by-side. + + Creates a figure with: + - Left subplot: Forward pass TFLOPS by configuration + - Right subplot: Backward pass TFLOPS by configuration + - Each backend+dtype combo as a separate bar group + + Args: + df: DataFrame with benchmark results (from BenchmarkRunner.results_to_dataframe) + config: BenchmarkConfig used for the run + output_path: Optional path for output file. 
If None, uses config.output_dir + + Returns: + Path to the saved chart file + """ + import matplotlib.pyplot as plt + import seaborn as sns + import numpy as np + + # Filter to successful results only + df = df[df["success"] == True].copy() + + if df.empty: + raise ValueError("No successful results to plot") + + # Create backend+dtype display name for legend + df["backend_display"] = df.apply(lambda r: get_backend_display_name(r["backend"], r["data_type"]), axis=1) + + # Create config label for x-axis (model/seqlen/mask) + df["config_label"] = df.apply( + lambda r: f"{r['model_name']}\n{r['q_seqlen']}x{r['kv_seqlen']}\n{r['attn_mask']}", + axis=1, + ) + + # Sort by backend order for consistent legend + df["backend_order"] = df["backend"].map(lambda b: BACKEND_CONFIG.get(b, {}).get("order", 99)) + df.sort_values(["model_name", "q_seqlen", "attn_mask", "backend_order"], inplace=True) + + # Build color palette based on unique backend+dtype combinations + # Get unique (backend, data_type, backend_display) tuples to map colors correctly + unique_combos = df[["backend", "data_type", "backend_display"]].drop_duplicates() + palette = {} + for _, row in unique_combos.iterrows(): + palette[row["backend_display"]] = get_backend_color(row["backend"], row["data_type"]) + + # Determine if we have fwd/bwd data + has_fwd = (df["fwd_tflops"] > 0).any() + has_bwd = (df["bwd_tflops"] > 0).any() + + if has_fwd and has_bwd: + fig, axes = plt.subplots(1, 2, figsize=(14, 6), dpi=150) + ax_fwd, ax_bwd = axes + elif has_fwd: + fig, ax_fwd = plt.subplots(1, 1, figsize=(10, 6), dpi=150) + ax_bwd = None + elif has_bwd: + fig, ax_bwd = plt.subplots(1, 1, figsize=(10, 6), dpi=150) + ax_fwd = None + else: + raise ValueError("No forward or backward TFLOPS data to plot") + + # Calculate y-axis limit + max_tflops = max( + df["fwd_tflops"].max() if has_fwd else 0, + df["bwd_tflops"].max() if has_bwd else 0, + ) + ylim_max = max_tflops * 1.15 # Add 15% headroom for labels + + # Plot forward pass + if 
ax_fwd is not None: + fwd_df = df[df["fwd_tflops"] > 0] + if not fwd_df.empty: + sns.barplot( + data=fwd_df, + x="config_label", + y="fwd_tflops", + hue="backend_display", + ax=ax_fwd, + palette=palette, + edgecolor="black", + linewidth=0.5, + ) + ax_fwd.set_xlabel("Configuration", fontsize=LABEL_FONT_SIZE) + ax_fwd.set_ylabel("TFLOPS", fontsize=LABEL_FONT_SIZE) + ax_fwd.set_title("SDPA Forward Pass", fontsize=TITLE_FONT_SIZE) + ax_fwd.legend(title="Backend", fontsize=LEGEND_FONT_SIZE) + ax_fwd.tick_params(axis="x", rotation=45, labelsize=8) + ax_fwd.tick_params(axis="y", labelsize=LABEL_FONT_SIZE) + ax_fwd.set_ylim(0, ylim_max) + + # Add value labels on bars + for container in ax_fwd.containers: + ax_fwd.bar_label(container, fmt="%.0f", fontsize=BAR_LABEL_FONT_SIZE) + + # Plot backward pass + if ax_bwd is not None: + bwd_df = df[df["bwd_tflops"] > 0] + if not bwd_df.empty: + sns.barplot( + data=bwd_df, + x="config_label", + y="bwd_tflops", + hue="backend_display", + ax=ax_bwd, + palette=palette, + edgecolor="black", + linewidth=0.5, + ) + ax_bwd.set_xlabel("Configuration", fontsize=LABEL_FONT_SIZE) + ax_bwd.set_ylabel("TFLOPS", fontsize=LABEL_FONT_SIZE) + ax_bwd.set_title("SDPA Backward Pass", fontsize=TITLE_FONT_SIZE) + ax_bwd.legend(title="Backend", fontsize=LEGEND_FONT_SIZE) + ax_bwd.tick_params(axis="x", rotation=45, labelsize=8) + ax_bwd.tick_params(axis="y", labelsize=LABEL_FONT_SIZE) + ax_bwd.set_ylim(0, ylim_max) + + # Add value labels on bars + for container in ax_bwd.containers: + ax_bwd.bar_label(container, fmt="%.0f", fontsize=BAR_LABEL_FONT_SIZE) + + plt.tight_layout() + + # Determine output path + if output_path is None: + output_dir = Path(config.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / f"{config.name}_comparison.png" + + plt.savefig(output_path, dpi=150, bbox_inches="tight") + plt.close() + + logger.info(f"Chart saved to {output_path}") + return output_path + + +def generate_charts_by_mask( + df: 
"pd.DataFrame", + config: "BenchmarkConfig", + output_dir: Optional[Path] = None, +) -> list: + """ + Generate separate charts for each mask type. + + This creates cleaner charts when benchmarking both causal and non-causal masks. + Each chart shows seqlen on x-axis and backends as grouped bars. + + Args: + df: DataFrame with benchmark results + config: BenchmarkConfig used for the run + output_dir: Directory for output files + + Returns: + List of paths to saved chart files + """ + import matplotlib.pyplot as plt + import seaborn as sns + + df = df[df["success"] == True].copy() + + if df.empty: + raise ValueError("No successful results to plot") + + if output_dir is None: + output_dir = Path(config.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + saved_paths = [] + masks = df["attn_mask"].unique() + + for mask in masks: + mask_df = df[df["attn_mask"] == mask].copy() + + # Create display names + mask_df["backend_display"] = mask_df.apply(lambda r: get_backend_display_name(r["backend"], r["data_type"]), axis=1) + mask_df["seqlen_label"] = mask_df.apply(lambda r: f"{r['q_seqlen']}x{r['kv_seqlen']}", axis=1) + + # Build palette + unique_combos = mask_df[["backend", "data_type", "backend_display"]].drop_duplicates() + palette = {} + for _, row in unique_combos.iterrows(): + palette[row["backend_display"]] = get_backend_color(row["backend"], row["data_type"]) + + # Sort + mask_df["backend_order"] = mask_df["backend"].map(lambda b: BACKEND_CONFIG.get(b, {}).get("order", 99)) + mask_df.sort_values(["q_seqlen", "backend_order"], inplace=True) + + has_fwd = (mask_df["fwd_tflops"] > 0).any() + has_bwd = (mask_df["bwd_tflops"] > 0).any() + + if has_fwd and has_bwd: + fig, (ax_fwd, ax_bwd) = plt.subplots(1, 2, figsize=(14, 6), dpi=150) + elif has_fwd: + fig, ax_fwd = plt.subplots(1, 1, figsize=(10, 6), dpi=150) + ax_bwd = None + else: + fig, ax_bwd = plt.subplots(1, 1, figsize=(10, 6), dpi=150) + ax_fwd = None + + mask_title = "Causal" if mask == "top_left" else 
"Non-Causal" if mask == "no_mask" else mask + + if ax_fwd is not None: + fwd_df = mask_df[mask_df["fwd_tflops"] > 0] + if not fwd_df.empty: + sns.barplot( + data=fwd_df, + x="seqlen_label", + y="fwd_tflops", + hue="backend_display", + ax=ax_fwd, + palette=palette, + edgecolor="black", + linewidth=0.5, + ) + ax_fwd.set_xlabel("Sequence Length", fontsize=LABEL_FONT_SIZE) + ax_fwd.set_ylabel("TFLOPS", fontsize=LABEL_FONT_SIZE) + ax_fwd.set_title(f"{config.name} Forward ({mask_title})", fontsize=TITLE_FONT_SIZE) + ax_fwd.legend(title="Backend", fontsize=LEGEND_FONT_SIZE) + ax_fwd.tick_params(axis="x", rotation=45) + for container in ax_fwd.containers: + ax_fwd.bar_label(container, fmt="%.0f", fontsize=BAR_LABEL_FONT_SIZE) + + if ax_bwd is not None: + bwd_df = mask_df[mask_df["bwd_tflops"] > 0] + if not bwd_df.empty: + sns.barplot( + data=bwd_df, + x="seqlen_label", + y="bwd_tflops", + hue="backend_display", + ax=ax_bwd, + palette=palette, + edgecolor="black", + linewidth=0.5, + ) + ax_bwd.set_xlabel("Sequence Length", fontsize=LABEL_FONT_SIZE) + ax_bwd.set_ylabel("TFLOPS", fontsize=LABEL_FONT_SIZE) + ax_bwd.set_title(f"{config.name} Backward ({mask_title})", fontsize=TITLE_FONT_SIZE) + ax_bwd.legend(title="Backend", fontsize=LEGEND_FONT_SIZE) + ax_bwd.tick_params(axis="x", rotation=45) + for container in ax_bwd.containers: + ax_bwd.bar_label(container, fmt="%.0f", fontsize=BAR_LABEL_FONT_SIZE) + + plt.tight_layout() + output_path = output_dir / f"{config.name}_{mask}.png" + plt.savefig(output_path, dpi=150, bbox_inches="tight") + plt.close() + saved_paths.append(output_path) + logger.info(f"Chart saved to {output_path}") + + return saved_paths + + +def generate_seqlen_scaling_chart( + df: "pd.DataFrame", + config: "BenchmarkConfig", + output_path: Optional[Path] = None, +) -> Path: + """ + Generate a chart showing performance scaling with sequence length. + + This chart is useful when benchmarking multiple sequence lengths with + the same model configuration. 
+ + Args: + df: DataFrame with benchmark results + config: BenchmarkConfig used for the run + output_path: Optional path for output file + + Returns: + Path to the saved chart file + """ + import matplotlib.pyplot as plt + import seaborn as sns + + # Filter to successful results only + df = df[df["success"] == True].copy() + + if df.empty: + raise ValueError("No successful results to plot") + + # Create backend+dtype display name + df["backend_display"] = df.apply(lambda r: get_backend_display_name(r["backend"], r["data_type"]), axis=1) + + # Use q_seqlen for x-axis (assuming symmetric seqlens for this chart) + df["seqlen"] = df["q_seqlen"] + + # Build color palette based on unique backend+dtype combinations + unique_combos = df[["backend", "data_type", "backend_display"]].drop_duplicates() + palette = {} + for _, row in unique_combos.iterrows(): + palette[row["backend_display"]] = get_backend_color(row["backend"], row["data_type"]) + + # Create figure + has_fwd = (df["fwd_tflops"] > 0).any() + has_bwd = (df["bwd_tflops"] > 0).any() + + if has_fwd and has_bwd: + fig, axes = plt.subplots(1, 2, figsize=(14, 6), dpi=150) + ax_fwd, ax_bwd = axes + elif has_fwd: + fig, ax_fwd = plt.subplots(1, 1, figsize=(10, 6), dpi=150) + ax_bwd = None + else: + fig, ax_bwd = plt.subplots(1, 1, figsize=(10, 6), dpi=150) + ax_fwd = None + + # Plot forward + if ax_fwd is not None and has_fwd: + fwd_df = df[df["fwd_tflops"] > 0] + sns.barplot( + data=fwd_df, + x="seqlen", + y="fwd_tflops", + hue="backend_display", + ax=ax_fwd, + palette=palette, + edgecolor="black", + linewidth=0.5, + ) + ax_fwd.set_xlabel("Sequence Length", fontsize=LABEL_FONT_SIZE) + ax_fwd.set_ylabel("TFLOPS", fontsize=LABEL_FONT_SIZE) + ax_fwd.set_title("SDPA Forward Pass", fontsize=TITLE_FONT_SIZE) + ax_fwd.legend(title="Backend", fontsize=LEGEND_FONT_SIZE) + ax_fwd.tick_params(axis="x", rotation=45) + + for container in ax_fwd.containers: + ax_fwd.bar_label(container, fmt="%.0f", fontsize=BAR_LABEL_FONT_SIZE) + + # 
Plot backward + if ax_bwd is not None and has_bwd: + bwd_df = df[df["bwd_tflops"] > 0] + sns.barplot( + data=bwd_df, + x="seqlen", + y="bwd_tflops", + hue="backend_display", + ax=ax_bwd, + palette=palette, + edgecolor="black", + linewidth=0.5, + ) + ax_bwd.set_xlabel("Sequence Length", fontsize=LABEL_FONT_SIZE) + ax_bwd.set_ylabel("TFLOPS", fontsize=LABEL_FONT_SIZE) + ax_bwd.set_title("SDPA Backward Pass", fontsize=TITLE_FONT_SIZE) + ax_bwd.legend(title="Backend", fontsize=LEGEND_FONT_SIZE) + ax_bwd.tick_params(axis="x", rotation=45) + + for container in ax_bwd.containers: + ax_bwd.bar_label(container, fmt="%.0f", fontsize=BAR_LABEL_FONT_SIZE) + + plt.tight_layout() + + # Determine output path + if output_path is None: + output_dir = Path(config.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / f"{config.name}_seqlen_scaling.png" + + plt.savefig(output_path, dpi=150, bbox_inches="tight") + plt.close() + + logger.info(f"Chart saved to {output_path}") + return output_path diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/config_types.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/config_types.py new file mode 100644 index 00000000..ee9df6e5 --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/config_types.py @@ -0,0 +1,183 @@ +""" +Core types for the SDPA benchmark configuration system. + +This module defines the dataclasses used to configure and collect results +from SDPA benchmarks. +""" + +from dataclasses import dataclass, field +from typing import Optional, List, Tuple + + +@dataclass +class ModelPreset: + """ + Represents a named model configuration preset. + + Defines the attention head configuration for a specific model architecture. + Can use either symmetric head dimensions (head_dim) or asymmetric + (head_dim_qk, head_dim_vo) for models like DeepSeek V3. 
+ + Attributes: + name: Identifier for this preset (e.g., "llama3.1", "dsv3") + num_q_heads: Number of query heads + num_kv_heads: Number of key/value heads (differs from num_q_heads for GQA) + head_dim: Head dimension (used if head_dim_qk/vo not specified) + head_dim_qk: Head dimension for Q/K tensors (optional, for asymmetric) + head_dim_vo: Head dimension for V/O tensors (optional, for asymmetric) + + Example: + # Symmetric head dimensions (Llama 3.1) + LLAMA3_1 = ModelPreset( + name="llama3.1", + num_q_heads=64, + num_kv_heads=8, + head_dim=128, + ) + + # Asymmetric head dimensions (DeepSeek V3) + DSV3 = ModelPreset( + name="dsv3", + num_q_heads=128, + num_kv_heads=128, + head_dim_qk=192, + head_dim_vo=128, + ) + """ + + name: str + num_q_heads: int + num_kv_heads: int + head_dim: int = 128 + head_dim_qk: Optional[int] = None + head_dim_vo: Optional[int] = None + + def __post_init__(self): + """Resolve head dimensions after initialization.""" + if self.head_dim_qk is None: + self.head_dim_qk = self.head_dim + if self.head_dim_vo is None: + self.head_dim_vo = self.head_dim + + +@dataclass +class BenchmarkConfig: + """ + Configuration for a benchmark suite. + + Defines a set of benchmarks to run. 
The runner will expand this into + individual benchmark cases via cartesian product of: + models x seqlens x backends x data_types x attn_masks x deterministic_bwd + + Attributes: + name: Identifier for this config (used in output filenames) + models: List of ModelPreset to benchmark + seqlens: List of (q_seqlen, kv_seqlen) tuples + backends: List of backend names (e.g., ["cudnn", "flash_attention_4"]) + data_types: List of data types (e.g., ["bfloat16", "fp8"]) + attn_masks: List of attention masks (e.g., ["top_left", "no_mask"]) + profile_pass: Which pass to profile ("fwd", "bwd", or "both") + batch_size: Batch size for all benchmarks + num_iterations: Number of iterations per benchmark + num_warmup_iterations: Warmup iterations before measurement + skip_ref: Skip reference validation + deterministic_bwd: List of deterministic modes to test for backward pass + output_dir: Directory for output files + + Example: + CONFIG = BenchmarkConfig( + name="my_benchmark", + models=[LLAMA3_1, DSV3], + seqlens=[(4096, 4096), (8192, 8192)], + backends=["cudnn", "flash_attention_4"], + data_types=["bfloat16", "fp8"], + attn_masks=["top_left", "no_mask"], + profile_pass="fwd", + ) + """ + + name: str + models: List[ModelPreset] + seqlens: List[Tuple[int, int]] + backends: List[str] = field(default_factory=lambda: ["cudnn"]) + data_types: List[str] = field(default_factory=lambda: ["bfloat16"]) + attn_masks: List[str] = field(default_factory=lambda: ["top_left"]) + profile_pass: str = "fwd" + batch_size: int = 1 + num_iterations: int = 10 + num_warmup_iterations: int = 0 + skip_ref: bool = True + deterministic_bwd: List[bool] = field(default_factory=lambda: [False]) + output_dir: str = "../results" + + +@dataclass +class BenchmarkResult: + """ + Result from a single benchmark execution. + + Contains both the configuration that was run and the measured results. 
+ + Attributes: + config_name: Name of the BenchmarkConfig this result belongs to + model_name: Name of the ModelPreset used + backend: Backend that was used + data_type: Data type that was used + attn_mask: Attention mask that was used + batch_size: Batch size + q_seqlen: Query sequence length + kv_seqlen: Key/value sequence length + num_q_heads: Number of query heads + num_kv_heads: Number of key/value heads + head_dim_qk: Head dimension for Q/K + head_dim_vo: Head dimension for V/O + profile_pass: Which pass was profiled + deterministic_bwd: Whether deterministic backward was used + fwd_time_ms: Forward pass time in milliseconds + bwd_time_ms: Backward pass time in milliseconds (0 if not run) + fwd_tflops: Forward pass throughput in TFLOPS + bwd_tflops: Backward pass throughput in TFLOPS + max_diff: Maximum difference vs reference (if validated) + num_iterations: Number of iterations run + success: Whether the benchmark completed successfully + error_message: Error message if benchmark failed + gpu_name: Name of the GPU used + cudnn_version: cuDNN version string + """ + + # Config identification + config_name: str + model_name: str + backend: str + data_type: str + attn_mask: str + + # Dimensions + batch_size: int + q_seqlen: int + kv_seqlen: int + num_q_heads: int + num_kv_heads: int + head_dim_qk: int + head_dim_vo: int + + # Execution options + profile_pass: str + deterministic_bwd: bool + + # Results + fwd_time_ms: float + bwd_time_ms: float + fwd_tflops: float + bwd_tflops: float + max_diff: float + num_iterations: int + + # Status + success: bool = True + error_message: Optional[str] = None + + # Metadata + gpu_name: Optional[str] = None + cudnn_version: Optional[str] = None + cudnn_backend_version: Optional[int] = None diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/__init__.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/__init__.py new file mode 100644 index 00000000..37bf2d67 --- /dev/null +++ 
b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/__init__.py @@ -0,0 +1,62 @@ +""" +Benchmark configuration loading utilities. + +This module provides functions to load benchmark configurations by name. +""" + +import importlib +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ..config_types import BenchmarkConfig + + +def load_config(name: str) -> "BenchmarkConfig": + """ + Load a benchmark configuration by name. + + Configurations are Python modules in the configs directory. + Each module should define a CONFIG variable of type BenchmarkConfig. + + Args: + name: Name of the config (without .py extension) + + Returns: + BenchmarkConfig instance + + Raises: + ValueError: If config not found or doesn't define CONFIG + + Example: + config = load_config("mlperf") + print(config.name) # "mlperf" + """ + try: + module = importlib.import_module(f".{name}", package=__package__) + except ModuleNotFoundError: + raise ValueError(f"Config '{name}' not found. " f"Create a file at configs/{name}.py with a CONFIG variable.") + + if not hasattr(module, "CONFIG"): + raise ValueError(f"Config module '{name}' must define a CONFIG variable of type BenchmarkConfig") + + return module.CONFIG + + +def list_configs() -> list: + """ + List available config names. 
+ + Returns: + List of config names (without .py extension) + """ + import os + from pathlib import Path + + configs_dir = Path(__file__).parent + configs = [] + + for f in configs_dir.iterdir(): + if f.suffix == ".py" and f.stem != "__init__": + configs.append(f.stem) + + return sorted(configs) diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/dsv3.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/dsv3.py new file mode 100644 index 00000000..404842c7 --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/dsv3.py @@ -0,0 +1,41 @@ +""" +DeepSeek V3 SDPA Benchmark Configuration + +Benchmarks DeepSeek V3-style MHA with asymmetric head dimensions. +Only causal (top_left) mask - no non-causal benchmarks needed. +Includes forward and backward pass benchmarking with deterministic mode options. + +Usage: + python -m benchmark.sdpa_benchmark_training.runner --config dsv3 + python -m benchmark.sdpa_benchmark_training.runner --config dsv3 --dry-run +""" + +from ..config_types import ModelPreset, BenchmarkConfig + +DSV3 = ModelPreset( + name="dsv3", + num_q_heads=128, + num_kv_heads=128, + head_dim_qk=192, + head_dim_vo=128, +) + +CONFIG = BenchmarkConfig( + name="dsv3", + models=[DSV3], + seqlens=[ + (32768, 32768), + (16384, 16384), + (8192, 8192), + (4096, 4096), + (2048, 2048), + ], + backends=["cudnn", "flash_attention_4"], + data_types=["bfloat16", "fp8"], + attn_masks=["top_left"], # Causal only + profile_pass="both", # Forward and backward + deterministic_bwd=[True], + batch_size=1, + num_iterations=10, + output_dir="results", +) diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/llama.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/llama.py new file mode 100644 index 00000000..803db837 --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/configs/llama.py @@ -0,0 +1,39 @@ +""" +Llama 3.1 SDPA 
Benchmark Configuration + +Benchmarks Llama 3.1 405B-style GQA attention with both causal and non-causal masks. +Includes forward and backward pass benchmarking with deterministic mode options. + +Usage: + python -m benchmark.sdpa_benchmark_training.runner --config llama + python -m benchmark.sdpa_benchmark_training.runner --config llama --dry-run +""" + +from ..config_types import ModelPreset, BenchmarkConfig + +LLAMA3_1 = ModelPreset( + name="llama3.1", + num_q_heads=64, + num_kv_heads=8, + head_dim=128, +) + +CONFIG = BenchmarkConfig( + name="llama3.1", + models=[LLAMA3_1], + seqlens=[ + (32768, 32768), + (16384, 16384), + (8192, 8192), + (4096, 4096), + (2048, 2048), + ], + backends=["cudnn", "flash_attention_4"], + data_types=["bfloat16", "fp8"], + attn_masks=["top_left", "no_mask"], # Both causal and non-causal + profile_pass="both", # Forward and backward + deterministic_bwd=[False], + batch_size=1, + num_iterations=10, + output_dir="results", +) diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/dsv3_20260126_110621.csv b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/dsv3_20260126_110621.csv new file mode 100644 index 00000000..cabd5912 --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/dsv3_20260126_110621.csv @@ -0,0 +1,41 @@ +config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,success,error_message,gpu_name,cudnn_version,cudnn_backend_version +dsv3,dsv3,cudnn,bfloat16,top_left,1,32768,32768,128,128,192,128,both,True,24.538,87.230,1792.000,1311.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,32768,32768,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return 
code 1. +stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,16384,16384,128,128,192,128,both,True,6.476,22.025,1698.000,1298.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,16384,16384,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. +stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,8192,8192,128,128,192,128,both,True,1.831,5.875,1501.000,1217.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,8192,8192,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. 
+stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,4096,4096,128,128,192,128,both,True,0.519,1.650,1324.000,1083.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,4096,4096,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. +stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,2048,2048,128,128,192,128,both,True,0.163,0.520,1053.000,859.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,2048,2048,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. 
+stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/dsv3_top_left_causal.png b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/dsv3_top_left_causal.png new file mode 100644 index 00000000..a79e4809 Binary files /dev/null and b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/dsv3_top_left_causal.png differ diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_20260126_110503.csv b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_20260126_110503.csv new file mode 100644 index 00000000..11f9fa9f --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_20260126_110503.csv @@ -0,0 +1,21 @@ +config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,success,error_message,gpu_name,cudnn_version,cudnn_backend_version +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,32768,32768,64,8,128,128,both,False,10.436,30.513,1686.000,1441.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,32768,32768,64,8,128,128,both,False,20.041,59.879,1756.000,1469.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 
+llama3.1,llama3.1,cudnn,fp8,top_left,1,32768,32768,64,8,128,128,both,False,8.317,25.675,2115.000,1713.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,32768,32768,64,8,128,128,both,False,16.521,49.482,2130.000,1778.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,16384,16384,64,8,128,128,both,False,2.672,8.018,1646.000,1371.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,16384,16384,64,8,128,128,both,False,5.037,15.384,1746.000,1429.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,16384,16384,64,8,128,128,both,False,2.182,6.730,2016.000,1634.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,16384,16384,64,8,128,128,both,False,4.240,12.707,2075.000,1731.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,8192,8192,64,8,128,128,both,False,0.704,2.150,1563.000,1279.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,8192,8192,64,8,128,128,both,False,1.313,3.980,1675.000,1381.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,8192,8192,64,8,128,128,both,False,0.591,1.851,1862.000,1485.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,8192,8192,64,8,128,128,both,False,1.133,3.385,1941.000,1624.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,4096,4096,64,8,128,128,both,False,0.212,0.622,1297.000,1105.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,4096,4096,64,8,128,128,both,False,0.350,1.090,1569.000,1261.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,4096,4096,64,8,128,128,both,False,0.172,0.555,1602.000,1239.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 
+llama3.1,llama3.1,cudnn,fp8,no_mask,1,4096,4096,64,8,128,128,both,False,0.299,0.941,1841.000,1461.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,2048,2048,64,8,128,128,both,False,0.067,0.209,1022.000,824.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,2048,2048,64,8,128,128,both,False,0.112,0.321,1232.000,1070.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,2048,2048,64,8,128,128,both,False,0.057,0.190,1215.000,905.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,2048,2048,64,8,128,128,both,False,0.090,0.284,1521.000,1210.000,0.000,10,True,,NVIDIA GB200,1.18.0,91801 diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_no_mask.png b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_no_mask.png new file mode 100644 index 00000000..ac4bced3 Binary files /dev/null and b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_no_mask.png differ diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_top_left_causal.png b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_top_left_causal.png new file mode 100644 index 00000000..f5f3a306 Binary files /dev/null and b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb200_918_only_cudnn/llama3.1_top_left_causal.png differ diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/dsv3_20260126_110622.csv b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/dsv3_20260126_110622.csv new file mode 100644 index 00000000..9dac38b4 --- /dev/null +++ 
b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/dsv3_20260126_110622.csv @@ -0,0 +1,41 @@ +config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,success,error_message,gpu_name,cudnn_version,cudnn_backend_version +dsv3,dsv3,cudnn,bfloat16,top_left,1,32768,32768,128,128,192,128,both,True,21.319,80.520,2063.000,1420.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,32768,32768,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. +stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,16384,16384,128,128,192,128,both,True,5.584,20.381,1969.000,1403.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,16384,16384,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. 
+stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,8192,8192,128,128,192,128,both,True,1.518,5.412,1811.000,1321.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,8192,8192,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. +stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,4096,4096,128,128,192,128,both,True,0.438,1.541,1570.000,1160.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,4096,4096,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. 
+stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, +dsv3,dsv3,cudnn,bfloat16,top_left,1,2048,2048,128,128,192,128,both,True,0.148,0.493,1158.000,906.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801.000 +dsv3,dsv3,cudnn,fp8,top_left,1,2048,2048,128,128,192,128,both,True,inf,inf,0.000,0.000,0.000,10,False,"Benchmark failed with return code 1. +stderr: Traceback (most recent call last): + File ""/workspace/cudnn_frontend/benchmark/sdpa_benchmark_training/benchmark_single_sdpa.py"", line 560, in + graph_fwd.validate() +cudnn._compiled_module.cudnnGraphNotSupportedError: hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+) + +stdout: ",,, diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/dsv3_top_left_causal.png b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/dsv3_top_left_causal.png new file mode 100644 index 00000000..39cdbe44 Binary files /dev/null and b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/dsv3_top_left_causal.png differ diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_20260126_110426.csv b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_20260126_110426.csv new file mode 100644 index 00000000..433b02ce --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_20260126_110426.csv @@ -0,0 +1,21 @@ 
+config_name,model_name,backend,data_type,attn_mask,batch_size,q_seqlen,kv_seqlen,num_q_heads,num_kv_heads,head_dim_qk,head_dim_vo,profile_pass,deterministic_bwd,fwd_time_ms,bwd_time_ms,fwd_tflops,bwd_tflops,max_diff,num_iterations,success,error_message,gpu_name,cudnn_version,cudnn_backend_version +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,32768,32768,64,8,128,128,both,False,8.663,28.331,2031.000,1552.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,32768,32768,64,8,128,128,both,False,17.400,56.680,2022.000,1552.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,32768,32768,64,8,128,128,both,False,5.942,23.707,2961.000,1855.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,32768,32768,64,8,128,128,both,False,11.782,45.618,2986.000,1928.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,16384,16384,64,8,128,128,both,False,2.202,7.361,1998.000,1494.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,16384,16384,64,8,128,128,both,False,4.396,14.124,2001.000,1557.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,16384,16384,64,8,128,128,both,False,1.577,6.233,2789.000,1764.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,16384,16384,64,8,128,128,both,False,3.025,11.772,2907.000,1868.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,8192,8192,64,8,128,128,both,False,0.571,1.976,1927.000,1391.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,8192,8192,64,8,128,128,both,False,1.118,3.670,1967.000,1498.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,8192,8192,64,8,128,128,both,False,0.434,1.728,2534.000,1591.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 
+llama3.1,llama3.1,cudnn,fp8,no_mask,1,8192,8192,64,8,128,128,both,False,0.807,3.154,2724.000,1743.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,4096,4096,64,8,128,128,both,False,0.164,0.574,1679.000,1198.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,4096,4096,64,8,128,128,both,False,0.289,1.016,1901.000,1352.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,4096,4096,64,8,128,128,both,False,0.129,0.527,2136.000,1305.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,4096,4096,64,8,128,128,both,False,0.213,0.884,2580.000,1554.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,top_left,1,2048,2048,64,8,128,128,both,False,0.054,0.191,1265.000,900.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,bfloat16,no_mask,1,2048,2048,64,8,128,128,both,False,0.088,0.299,1559.000,1151.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,top_left,1,2048,2048,64,8,128,128,both,False,0.044,0.181,1574.000,947.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 +llama3.1,llama3.1,cudnn,fp8,no_mask,1,2048,2048,64,8,128,128,both,False,0.066,0.275,2086.000,1251.000,0.000,10,True,,NVIDIA GB300,1.18.0,91801 diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_no_mask.png b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_no_mask.png new file mode 100644 index 00000000..312bab6f Binary files /dev/null and b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_no_mask.png differ diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_top_left_causal.png 
b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_top_left_causal.png new file mode 100644 index 00000000..dc6fd4d4 Binary files /dev/null and b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/results/gb300_918_only_cudnn/llama3.1_top_left_causal.png differ diff --git a/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/runner.py b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/runner.py new file mode 100644 index 00000000..e1201eae --- /dev/null +++ b/third_party/cudnn-frontend/benchmark/sdpa_benchmark_training/runner.py @@ -0,0 +1,505 @@ +""" +Benchmark runner with configuration expansion, execution, and result collection. + +This module provides the BenchmarkRunner class for running SDPA benchmarks +from configuration files, and a CLI entry point. + +Usage: + # Run from command line + python -m benchmark.sdpa_benchmark_training.runner --config mlperf + python -m benchmark.sdpa_benchmark_training.runner --config mlperf --dry-run + + # Import and use programmatically + from benchmark.sdpa_benchmark_training.runner import BenchmarkRunner + from benchmark.sdpa_benchmark_training.configs import load_config + + config = load_config("mlperf") + runner = BenchmarkRunner() + results = runner.run_config(config) + runner.save_csv(results, config) +""" + +import itertools +import logging +import sys +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import Dict, Any, Iterator, List, Optional + +from .config_types import BenchmarkConfig, BenchmarkResult, ModelPreset + +logger = logging.getLogger(__name__) + + +def log_environment_info(): + """Log environment information (torch, CUDA, cuDNN, flash_attn versions).""" + try: + import torch + + logger.info(f"torch.__version__ = '{torch.__version__}'") + logger.info(f"torch.version.cuda = '{torch.version.cuda}'") + logger.info(f"torch.cuda.is_available() = {torch.cuda.is_available()}") + if 
torch.cuda.is_available(): + logger.info(f"torch.cuda.device_count() = {torch.cuda.device_count()}") + logger.info(f"torch.cuda.current_device() = {torch.cuda.current_device()}") + logger.info(f"torch.cuda.get_device_name(torch.cuda.current_device()) = '{torch.cuda.get_device_name(torch.cuda.current_device())}'") + logger.info(f"torch.backends.cudnn.enabled = {torch.backends.cudnn.enabled}") + except ImportError: + logger.warning("torch not available") + + try: + import cudnn + + logger.info(f"cuDNN Backend Version: cudnn.backend_version() = {cudnn.backend_version()}") + logger.info(f"cuDNN Frontend Version: cudnn.__version__ = '{cudnn.__version__}'") + except ImportError: + logger.warning("cudnn not available") + + try: + import flash_attn + + logger.info(f"flash_attn.__version__ = '{flash_attn.__version__}'") + except ImportError: + pass # flash_attn is optional + + +class BenchmarkRunner: + """ + Runs benchmarks from configurations with cartesian product expansion. + + The runner takes a BenchmarkConfig and expands it into individual benchmark + cases via cartesian product of all configuration dimensions. Each case is + then executed and results are collected. + + Attributes: + verbose: Whether to print progress information + + Example: + runner = BenchmarkRunner(verbose=True) + config = load_config("mlperf") + + # Dry run to see what would be executed + for case in runner.expand_config(config): + print(case) + + # Actually run the benchmarks + results = runner.run_config(config) + runner.save_csv(results, config) + """ + + def __init__(self, verbose: bool = True): + """ + Initialize the runner. 
+ + Args: + verbose: Whether to print progress information + """ + self.verbose = verbose + self._setup_logging() + + def _setup_logging(self): + """Configure logging based on verbosity setting.""" + level = logging.INFO if self.verbose else logging.WARNING + logging.basicConfig( + level=level, + format="[%(levelname)s] %(message)s", + stream=sys.stderr, + ) + + def expand_config(self, config: BenchmarkConfig) -> Iterator[Dict[str, Any]]: + """ + Expand a BenchmarkConfig into individual benchmark cases. + + Performs cartesian product expansion over: + models x seqlens x backends x data_types x attn_masks x deterministic_bwd + + Args: + config: BenchmarkConfig to expand + + Yields: + Dict containing all parameters for a single benchmark run + """ + for model, (q_seqlen, kv_seqlen), backend, data_type, attn_mask, det_bwd in itertools.product( + config.models, + config.seqlens, + config.backends, + config.data_types, + config.attn_masks, + config.deterministic_bwd, + ): + # Skip deterministic mode for forward-only runs + if det_bwd and config.profile_pass == "fwd": + continue + + yield { + "config_name": config.name, + "model": model, + "q_seqlen": q_seqlen, + "kv_seqlen": kv_seqlen, + "backend": backend, + "data_type": data_type, + "attn_mask": attn_mask, + "profile_pass": config.profile_pass, + "batch_size": config.batch_size, + "num_iterations": config.num_iterations, + "num_warmup_iterations": config.num_warmup_iterations, + "skip_ref": config.skip_ref, + "deterministic_bwd": det_bwd, + } + + def run_single(self, case: Dict[str, Any]) -> BenchmarkResult: + """ + Run a single benchmark case. + + Calls the run_benchmark() function from benchmark_single_sdpa.py + and wraps the result in a BenchmarkResult. 
+ + Args: + case: Dict containing benchmark parameters (from expand_config) + + Returns: + BenchmarkResult with timing data or error information + """ + model: ModelPreset = case["model"] + + try: + # Import here to avoid circular imports and allow the module to be + # used even if torch/cudnn aren't installed (for dry-run mode) + from .benchmark_single_sdpa import run_benchmark + + result = run_benchmark( + batch_size=case["batch_size"], + q_seqlen=case["q_seqlen"], + kv_seqlen=case["kv_seqlen"], + num_q_heads=model.num_q_heads, + num_kv_heads=model.num_kv_heads, + head_dim_qk=model.head_dim_qk, + head_dim_vo=model.head_dim_vo, + data_type=case["data_type"], + backend=case["backend"], + attn_mask=case["attn_mask"], + profile_pass=case["profile_pass"], + num_iterations=case["num_iterations"], + num_warmup_iterations=case["num_warmup_iterations"], + skip_ref=case["skip_ref"], + deterministic_bwd=case["deterministic_bwd"], + ) + + return BenchmarkResult( + config_name=case["config_name"], + model_name=model.name, + backend=case["backend"], + data_type=case["data_type"], + attn_mask=case["attn_mask"], + batch_size=case["batch_size"], + q_seqlen=case["q_seqlen"], + kv_seqlen=case["kv_seqlen"], + num_q_heads=model.num_q_heads, + num_kv_heads=model.num_kv_heads, + head_dim_qk=model.head_dim_qk, + head_dim_vo=model.head_dim_vo, + profile_pass=case["profile_pass"], + deterministic_bwd=case["deterministic_bwd"], + fwd_time_ms=result["fwd_time_ms"], + bwd_time_ms=result["bwd_time_ms"], + fwd_tflops=result["fwd_tflops"], + bwd_tflops=result["bwd_tflops"], + max_diff=result["max_diff"], + num_iterations=case["num_iterations"], + success=True, + gpu_name=result.get("gpu_name"), + cudnn_version=result.get("cudnn_version"), + cudnn_backend_version=result.get("cudnn_backend_version"), + ) + + except Exception as e: + logger.error(f"Benchmark failed: {e}") + return BenchmarkResult( + config_name=case["config_name"], + model_name=model.name, + backend=case["backend"], + 
data_type=case["data_type"], + attn_mask=case["attn_mask"], + batch_size=case["batch_size"], + q_seqlen=case["q_seqlen"], + kv_seqlen=case["kv_seqlen"], + num_q_heads=model.num_q_heads, + num_kv_heads=model.num_kv_heads, + head_dim_qk=model.head_dim_qk, + head_dim_vo=model.head_dim_vo, + profile_pass=case["profile_pass"], + deterministic_bwd=case["deterministic_bwd"], + fwd_time_ms=float("inf"), + bwd_time_ms=float("inf"), + fwd_tflops=0.0, + bwd_tflops=0.0, + max_diff=0.0, + num_iterations=case["num_iterations"], + success=False, + error_message=str(e), + ) + + def run_config( + self, + config: BenchmarkConfig, + filter_model: Optional[str] = None, + filter_backend: Optional[str] = None, + filter_dtype: Optional[str] = None, + ) -> List[BenchmarkResult]: + """ + Run all benchmarks from a configuration. + + Args: + config: BenchmarkConfig to run + filter_model: Optional model name filter (substring match) + filter_backend: Optional backend filter (exact match) + filter_dtype: Optional data type filter (exact match) + + Returns: + List of BenchmarkResult for all executed cases + """ + # Log environment info at the start + log_environment_info() + logger.info("") # Blank line for readability + + results = [] + cases = list(self.expand_config(config)) + + # Apply filters + if filter_model: + cases = [c for c in cases if filter_model in c["model"].name] + if filter_backend: + cases = [c for c in cases if c["backend"] == filter_backend] + if filter_dtype: + cases = [c for c in cases if c["data_type"] == filter_dtype] + + if not cases: + logger.warning("No benchmark cases to run after applying filters") + return results + + logger.info(f"Running {len(cases)} benchmark cases from config '{config.name}'") + + for i, case in enumerate(cases, 1): + model = case["model"] + det_str = "det" if case["deterministic_bwd"] else "non-det" + logger.info( + f"[{i}/{len(cases)}] {model.name} | " + f"seq={case['q_seqlen']}x{case['kv_seqlen']} | " + f"{case['backend']} | 
{case['data_type']} | " + f"{case['attn_mask']} | {det_str}" + ) + + result = self.run_single(case) + results.append(result) + + if result.success: + fwd_info = f"fwd: {result.fwd_time_ms:.3f}ms ({result.fwd_tflops:.0f} TFLOPS)" + bwd_info = f"bwd: {result.bwd_time_ms:.3f}ms ({result.bwd_tflops:.0f} TFLOPS)" + logger.info(f" -> {fwd_info}, {bwd_info}") + else: + logger.warning(f" -> FAILED: {result.error_message}") + + return results + + def results_to_dataframe(self, results: List[BenchmarkResult]): + """ + Convert results to a pandas DataFrame. + + Args: + results: List of BenchmarkResult + + Returns: + pandas DataFrame with all result fields as columns + """ + import pandas as pd + + return pd.DataFrame([asdict(r) for r in results]) + + def save_csv( + self, + results: List[BenchmarkResult], + config: BenchmarkConfig, + output_path: Optional[Path] = None, + ) -> Path: + """ + Save results to a CSV file. + + Args: + results: List of BenchmarkResult + config: BenchmarkConfig (used for default filename) + output_path: Optional explicit output path + + Returns: + Path to the saved CSV file + """ + import pandas as pd + + df = self.results_to_dataframe(results) + + if output_path is None: + output_dir = Path(config.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = output_dir / f"{config.name}_{timestamp}.csv" + + df.to_csv(output_path, index=False, float_format="%.3f") + logger.info(f"Results saved to {output_path}") + + return output_path + + +def main(): + """CLI entry point.""" + import argparse + + parser = argparse.ArgumentParser( + description="Run SDPA benchmarks from configuration files", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Run all benchmarks from mlperf config + python -m benchmark.sdpa_benchmark_training.runner --config mlperf + + # Dry run (show what would be executed) + python -m benchmark.sdpa_benchmark_training.runner --config 
mlperf --dry-run + + # Filter by model name + python -m benchmark.sdpa_benchmark_training.runner --config mlperf --filter llama3.1 + + # Filter by backend + python -m benchmark.sdpa_benchmark_training.runner --config mlperf --backend cudnn + + # Skip chart generation + python -m benchmark.sdpa_benchmark_training.runner --config mlperf --no-chart + """, + ) + + parser.add_argument( + "--config", + required=True, + help="Config name (e.g., 'mlperf'). Must be a Python file in configs/", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print benchmark cases without executing", + ) + parser.add_argument( + "--filter", + dest="filter_model", + help="Filter by model name (substring match)", + ) + parser.add_argument( + "--backend", + dest="filter_backend", + help="Filter by backend (exact match)", + ) + parser.add_argument( + "--dtype", + dest="filter_dtype", + help="Filter by data type (exact match)", + ) + parser.add_argument( + "--output", + type=Path, + help="Output path for CSV (default: artifacts/_.csv)", + ) + parser.add_argument( + "--no-chart", + action="store_true", + help="Skip chart generation", + ) + parser.add_argument( + "--list-configs", + action="store_true", + help="List available configurations and exit", + ) + parser.add_argument( + "--quiet", + action="store_true", + help="Reduce output verbosity", + ) + + args = parser.parse_args() + + # Handle --list-configs + if args.list_configs: + from .configs import list_configs + + configs = list_configs() + print("Available configurations:") + for name in configs: + print(f" {name}") + return + + # Load config + from .configs import load_config + + try: + config = load_config(args.config) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + runner = BenchmarkRunner(verbose=not args.quiet) + + # Dry run mode + if args.dry_run: + cases = list(runner.expand_config(config)) + + # Apply filters for display + if args.filter_model: + cases = [c for c in cases if 
args.filter_model in c["model"].name] + if args.filter_backend: + cases = [c for c in cases if c["backend"] == args.filter_backend] + if args.filter_dtype: + cases = [c for c in cases if c["data_type"] == args.filter_dtype] + + print(f"Would run {len(cases)} benchmark cases from config '{config.name}':") + print() + for i, case in enumerate(cases, 1): + model = case["model"] + det_str = "det" if case["deterministic_bwd"] else "non-det" + print( + f" [{i}] {model.name} | " + f"seq={case['q_seqlen']}x{case['kv_seqlen']} | " + f"{case['backend']} | {case['data_type']} | " + f"{case['attn_mask']} | {det_str}" + ) + return + + # Run benchmarks + results = runner.run_config( + config, + filter_model=args.filter_model, + filter_backend=args.filter_backend, + filter_dtype=args.filter_dtype, + ) + + if not results: + print("No results to save", file=sys.stderr) + sys.exit(1) + + # Save CSV + csv_path = runner.save_csv(results, config, args.output) + + # Generate charts (separate chart per mask type for clarity) + if not args.no_chart: + try: + from .charts import generate_charts_by_mask + + df = runner.results_to_dataframe(results) + chart_paths = generate_charts_by_mask(df, config) + for path in chart_paths: + print(f"Chart saved to {path}") + except ImportError as e: + logger.warning(f"Could not generate chart (missing dependency): {e}") + except Exception as e: + logger.warning(f"Could not generate chart: {e}") + + print(f"Results saved to {csv_path}") + + +if __name__ == "__main__": + main() diff --git a/third_party/cudnn-frontend/cmake/cuDNN.cmake b/third_party/cudnn-frontend/cmake/cuDNN.cmake new file mode 100644 index 00000000..0ab86363 --- /dev/null +++ b/third_party/cudnn-frontend/cmake/cuDNN.cmake @@ -0,0 +1,115 @@ +add_library(CUDNN::cudnn_all INTERFACE IMPORTED) + +find_path( + CUDNN_INCLUDE_DIR cudnn.h + HINTS $ENV{CUDNN_INCLUDE_PATH} ${CUDNN_INCLUDE_PATH} $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS} + PATH_SUFFIXES 
include + REQUIRED +) + +file(READ "${CUDNN_INCLUDE_DIR}/cudnn_version.h" cudnn_version_header) +string(REGEX MATCH "#define CUDNN_MAJOR [1-9]+" macrodef "${cudnn_version_header}") +string(REGEX MATCH "[1-9]+" CUDNN_MAJOR_VERSION "${macrodef}") + +function(find_cudnn_library NAME) + if(NOT "${ARGV1}" STREQUAL "OPTIONAL") + set(_cudnn_required "REQUIRED") + else() + set(_cudnn_required "") + endif() + + find_library( + ${NAME}_LIBRARY + NAMES ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}" + NAMES_PER_DIR + HINTS $ENV{CUDNN_LIBRARY_PATH} ${CUDNN_LIBRARY_PATH} $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib64 lib/x64 lib + ${_cudnn_required} + ) + + if(${NAME}_LIBRARY) + add_library(CUDNN::${NAME} UNKNOWN IMPORTED) + set_target_properties( + CUDNN::${NAME} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR} + IMPORTED_LOCATION ${${NAME}_LIBRARY} + ) + message(STATUS "${NAME} found at ${${NAME}_LIBRARY}.") + else() + message(STATUS "${NAME} not found.") + endif() +endfunction() + +find_cudnn_library(cudnn) + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args( + LIBRARY REQUIRED_VARS + CUDNN_INCLUDE_DIR cudnn_LIBRARY +) + +if(CUDNN_INCLUDE_DIR AND cudnn_LIBRARY) + + message(STATUS "cuDNN: ${cudnn_LIBRARY}") + message(STATUS "cuDNN: ${CUDNN_INCLUDE_DIR}") + + set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found") + +else() + + set(CUDNN_FOUND OFF CACHE INTERNAL "cuDNN Library Not Found") + +endif() + +target_include_directories( + CUDNN::cudnn_all + INTERFACE + $ + $ +) + +target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn +) + +if(CUDNN_MAJOR_VERSION EQUAL 8) + find_cudnn_library(cudnn_adv_infer) + find_cudnn_library(cudnn_adv_train) + find_cudnn_library(cudnn_cnn_infer) + find_cudnn_library(cudnn_cnn_train) + find_cudnn_library(cudnn_ops_infer) + find_cudnn_library(cudnn_ops_train) + + target_link_libraries( + CUDNN::cudnn_all + INTERFACE + 
CUDNN::cudnn_adv_train + CUDNN::cudnn_ops_train + CUDNN::cudnn_cnn_train + CUDNN::cudnn_adv_infer + CUDNN::cudnn_cnn_infer + CUDNN::cudnn_ops_infer + ) +elseif(CUDNN_MAJOR_VERSION EQUAL 9) + find_cudnn_library(cudnn_graph) + find_cudnn_library(cudnn_engines_runtime_compiled) + find_cudnn_library(cudnn_ops OPTIONAL) + find_cudnn_library(cudnn_cnn OPTIONAL) + find_cudnn_library(cudnn_adv OPTIONAL) + find_cudnn_library(cudnn_engines_precompiled OPTIONAL) + find_cudnn_library(cudnn_heuristic OPTIONAL) + + target_link_libraries( + CUDNN::cudnn_all + INTERFACE + CUDNN::cudnn_graph + CUDNN::cudnn_engines_runtime_compiled + CUDNN::cudnn_ops + CUDNN::cudnn_cnn + CUDNN::cudnn_adv + CUDNN::cudnn_engines_precompiled + CUDNN::cudnn_heuristic + ) +endif() diff --git a/third_party/cudnn-frontend/cudnn_frontend-config.cmake.in b/third_party/cudnn-frontend/cudnn_frontend-config.cmake.in new file mode 100644 index 00000000..8b2d8430 --- /dev/null +++ b/third_party/cudnn-frontend/cudnn_frontend-config.cmake.in @@ -0,0 +1,3 @@ +@PACKAGE_INIT@ + +include(${CMAKE_CURRENT_LIST_DIR}/cudnn_frontend-targets.cmake) diff --git a/third_party/cudnn-frontend/dlpack_version.txt b/third_party/cudnn-frontend/dlpack_version.txt new file mode 100644 index 00000000..9459d4ba --- /dev/null +++ b/third_party/cudnn-frontend/dlpack_version.txt @@ -0,0 +1 @@ +1.1 diff --git a/third_party/cudnn-frontend/include/cudnn_backend_base.h b/third_party/cudnn-frontend/include/cudnn_backend_base.h new file mode 100644 index 00000000..bae2de8a --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_backend_base.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include + +namespace cudnn_frontend { + +/// +/// OpaqueBackendPointer class +/// Holds the raws pointer to backend_descriptor +/// Usage is to wrap this into a smart pointer as +/// it helps to create and destroy the backendpointer + +class OpaqueBackendPointer { + cudnnBackendDescriptor_t m_desc = nullptr; //!< Raw void pointer + cudnnStatus_t status = CUDNN_STATUS_SUCCESS; //!< status of creation of the Descriptor + + public: + OpaqueBackendPointer(const OpaqueBackendPointer&) = delete; //!< Delete the copy constructor to prevent bad copies + OpaqueBackendPointer& + operator=(const OpaqueBackendPointer&) = delete; + OpaqueBackendPointer(OpaqueBackendPointer&&) = default; + + /** + * OpaqueBackendPointer constructor. + * Calls the cudnnBackendCreateDescriptor. Allocates memory according to the type. 
+ */ + OpaqueBackendPointer(cudnnBackendDescriptorType_t type) { status = detail::create_descriptor(type, &m_desc); } + /** + * OpaqueBackendPointer destructor. + * Calls the cudnnBackendDestroyDescriptor. Frees memory allocated in the constructor. + */ + ~OpaqueBackendPointer() { detail::destroy_descriptor(m_desc); }; + /** + * Accessor. + * Returns the const reference to raw underlying descriptor. + * Treat it like the data() function of a smart pointer. Can be freed behind the back. + */ + cudnnBackendDescriptor_t const& + get_backend_descriptor() const { + return m_desc; + } + /** + * Accessor. + * Queries the status of the descriptor after calling the cudnnCreate. + */ + cudnnStatus_t + get_status() const { + return status; + } + /** + * Accessor. + * Queries the status of the descriptor returns true if all good. + */ + bool + is_good() const { + return status == CUDNN_STATUS_SUCCESS; + } +}; + +/*! \var A shared_ptr wrapper on top of the OpaqueBackendPointer */ +using ManagedOpaqueDescriptor = std::shared_ptr; + +/*! \fn A wrapper on top of the std::make_shared for the OpaqueBackendPointer */ +static ManagedOpaqueDescriptor +make_shared_backend_pointer(cudnnBackendDescriptorType_t type) { + return std::make_shared(type); +} + +/// +/// BackendDescriptor class +/// Holds a Managed pointer to OpaqueBackendPointer class +/// Contains the status and error message if set after any operation. +/// If exception is disabled the user must query the status after +/// build operation in order to check if the cudnn construct was built +/// correctly. +class BackendDescriptor { + public: + //! Return a string describing the backend Descriptor + virtual std::string + describe() const = 0; + + //! Get a copy of the raw descriptor pointer. Ownership is reatined and + //! gets deleted when out of scope + cudnnBackendDescriptor_t + get_raw_desc() const { + return pointer->get_backend_descriptor(); + } + + //! 
Current status of the descriptor + cudnnStatus_t + get_status() const { + return status; + } + + //! Set status of the descriptor + void + set_status(cudnnStatus_t const status_) const { + status = status_; + } + + //! Set Diagonistic error message. + void + set_error(const char* message) const { + err_msg = message; + } + + //! Diagonistic error message if any + const char* + get_error() const { + return err_msg.c_str(); + } + + //! Returns a copy of underlying managed descriptor + ManagedOpaqueDescriptor + get_desc() const { + return pointer; + } + + //! Initializes the underlying managed descriptor + cudnnStatus_t + initialize_managed_backend_pointer(cudnnBackendDescriptorType_t type) { + pointer = make_shared_backend_pointer(type); + return pointer->get_status(); + } + + protected: + /** + * BackendDescriptor constructor. + * Initializes the member variables as passed. + */ + BackendDescriptor(ManagedOpaqueDescriptor pointer_, cudnnStatus_t status_, std::string err_msg_) + : pointer(pointer_), status(status_), err_msg(err_msg_) {} + BackendDescriptor() = default; + + virtual ~BackendDescriptor() {}; + + ManagedOpaqueDescriptor pointer; //! Shared pointer of the OpaqueBackendPointer + + mutable cudnnStatus_t status = CUDNN_STATUS_SUCCESS; //!< Error code if any being set + mutable std::string err_msg; //!< Error message if any being set +}; + +} // namespace cudnn_frontend diff --git a/third_party/cudnn-frontend/include/cudnn_frontend.h b/third_party/cudnn-frontend/include/cudnn_frontend.h new file mode 100644 index 00000000..fe1a3500 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +// Suppress MSVC warning C4756 (overflow in constant arithmetic) that occurs +// in MSVC's header with certain compiler versions +#ifdef _MSC_VER +#pragma warning(disable : 4756) +#endif + +/*! \mainpage CUDNN FRONTEND API + * + * \section Introduction + * + * The cuDNN Frontend API is a C++ header-only library that demonstrates how to use the cuDNN C backend API. The cuDNN C + * backend API is documented in the cuDNN developer guide. + * + * \section Why use Frontend API + * + * Consider the following code snippet which showcases cudnnBackendTensor creation using the backend API and its + * equivalent front-end API code. Many among the backend constructs follow similar pattern. 
+ * + * ~~~~~~~~~~~~~~~{.cpp} + * + * =========================================================================================== + * auto check_status = [](cudnnStatus_t status) { assert (status == CUDNN_STATUS_SUCCESS); }; + * =========================================================================================== + * // Backend code for Tensor Creation. + * cudnnBackendDescriptor_t tensor; + * + * check_status (cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensor)); + * + * check_status (cudnnBackendSetAttribute(tensor, + * CUDNN_ATTR_TENSOR_DATA_TYPE, + * CUDNN_TYPE_DATA_TYPE, + * 1, + * &data_type)); + * check_status (cudnnBackendSetAttribute(tensor, + * CUDNN_ATTR_TENSOR_DIMENSIONS, + * CUDNN_TYPE_INT64, + * tensor_dim.size(), + * tensor_dim.data())); + * check_status (cudnnBackendSetAttribute(tensor, + * CUDNN_ATTR_TENSOR_STRIDES, + * CUDNN_TYPE_INT64, + * tensor_str.size(), + * tensor_str.data())); + * check_status (cudnnBackendSetAttribute(tensor, + * CUDNN_ATTR_TENSOR_UNIQUE_ID, + * CUDNN_TYPE_INT64, + * 1, + * &id)); + * check_status (cudnnBackendSetAttribute(tensor, + * CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, + * CUDNN_TYPE_INT64, + * 1, + * &alignment)); + * check_status (cudnnBackendFinalize(tensor)); + * + * check_status (cudnnBackendDestroyDescriptor(tensor)); + * =========================================================================================== + * // FrontEnd equivalent code. + * auto tensor = cudnn_frontend::TensorBuilder() + * .setDim(tensor_dim.size(), tensor_dim.data()) + * .setStrides(tensor_str.size(), tensor_str.data()) + * .setId(id) + * .setAlignment(alignment) + * .setDataType(data_type) + * .build(); + * check_status(tensor.get_status()); + * =========================================================================================== + * + * ~~~~~~~~~~~~~~~ + * + * Frontend API serves two major purpose as a companion to the backend API. + * - Functional additions: + * - Support for auto-tuning. 
(cudnnGet and cudnnFind) + * - Errata filters. + * - Programmatic ease: + * - Easy memory management for the cudnnBackendDescriptor_t (RAII based classes). + * - Error handling with optional exception support. Better error messages. + * - Fewer lines of code (5-10x reduction in LOC). + * - Simpler samples on how to use the new API. + */ + +#include + +#include "cudnn_frontend_ConvDesc.h" +#include "cudnn_frontend_Heuristics.h" +#include "cudnn_frontend_Engine.h" +#include "cudnn_frontend_EngineConfig.h" +#include "cudnn_frontend_EngineFallbackList.h" +#include "cudnn_frontend_Errata.h" +#include "cudnn_frontend_ExecutionPlan.h" +#include "cudnn_frontend_Filters.h" +#include "cudnn_frontend_Operation.h" +#include "cudnn_frontend_OperationGraph.h" +#include "cudnn_frontend_Tensor.h" +#include "cudnn_frontend_VariantPack.h" +#include "cudnn_frontend_PointWiseDesc.h" +#include "cudnn_frontend_MatMulDesc.h" +#include "cudnn_frontend_Logging.h" +#include "cudnn_frontend_Reorder_Tensor.h" +#include "cudnn_frontend_ExecutionPlanCache.h" +#include "cudnn_frontend_utils.h" + +#include "cudnn_frontend_Resample.h" + +#include "cudnn_frontend/graph_interface.h" +#include "cudnn_frontend/utils/serialize.h" +#include "cudnn_frontend/backend/kernel_cache.h" +#include "cudnn_frontend/utils/attn_score_modifiers.h" +#include "cudnn_frontend/backend/device_properties.h" + +#include "cudnn_frontend_version.h" + +namespace cudnn_frontend { +using ConvDesc = ConvDesc_v8; +using ConvDescBuilder = ConvDescBuilder_v8; +using ReductionDesc = ReductionDesc_v8; +using ReductionDescBuilder = ReductionDescBuilder_v8; +using EngineHeuristicsBuilder = EngineHeuristicsBuilder_v8; +using EngineHeuristics = EngineHeuristics_v8; +using EngineBuilder = EngineBuilder_v8; +using Engine = Engine_v8; +using EngineConfig = EngineConfig_v8; +using EngineConfigBuilder = EngineConfigBuilder_v8; +using EngineFallbackList = EngineFallbackList_v8; +using EngineFallbackListBuilder = EngineFallbackListBuilder_v8; 
+using ResampleDesc = ResampleDesc_v8; +using ResampleDescBuilder = ResampleDescBuilder_v8; +} // namespace cudnn_frontend diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/backend/backend_descriptor.h b/third_party/cudnn-frontend/include/cudnn_frontend/backend/backend_descriptor.h new file mode 100644 index 00000000..2cd68c0f --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/backend/backend_descriptor.h @@ -0,0 +1,138 @@ +#pragma once + +#include + +#include "../graph_helpers.h" +#include "cudnn.h" + +namespace cudnn_frontend::detail { + +/** + * @brief RAII wrapper around a `cudnnBackendDescriptor_t` object. + * + * This class provides a convenient way to manage the lifetime of a `cudnnBackendDescriptor_t` + * object using the RAII (Resource Acquisition Is Initialization) idiom. It automatically + * creates the descriptor when the object is constructed and destroys it when the object + * is destroyed, ensuring proper resource management and preventing memory leaks. + * + * @note The constructor of this class does not throw exceptions. Instead, it stores the + * status of the descriptor creation operation in the `status` member variable. Callers + * should check this status and handle any errors accordingly. + */ +class backend_descriptor { + public: + /** + * @brief Constructs a `backend_descriptor` object. + * + * @param type The type of the backend descriptor to create. + */ + backend_descriptor(cudnnBackendDescriptorType_t type) { status = detail::create_descriptor(type, &desc); } + + /** + * @brief Move constructor. + * + * Transfers the ownership of the `cudnnBackendDescriptor_t` object to the new + * `backend_descriptor` instance. + * + * @param other The source `backend_descriptor` object. + */ + backend_descriptor(backend_descriptor&& other) noexcept : desc(other.desc), status(other.status) { + other.desc = nullptr; + other.status = CUDNN_STATUS_NOT_INITIALIZED; + } + + /** + * @brief Move assignment operator. 
+ * + * Transfers the ownership of the `cudnnBackendDescriptor_t` object to the new + * `backend_descriptor` instance. + * + * @param other The source `backend_descriptor` object. + * @return A reference to the current `backend_descriptor` object. + */ + backend_descriptor& + operator=(backend_descriptor&& other) noexcept { + if (this != &other) { + desc = other.desc; + status = other.status; + + other.desc = nullptr; + } + return *this; + } + + /** + * @brief Destructor. + * + * Destroys the `cudnnBackendDescriptor_t` object and frees the associated resources. + */ + ~backend_descriptor() { + if (desc) { + detail::destroy_descriptor(desc); + } + } + + /** + * @brief Deleted copy constructor and assignment operator. + * + * `backend_descriptor` objects are not copyable to prevent unintended resource + * sharing and potential memory leaks. + */ + backend_descriptor(backend_descriptor const&) = delete; + backend_descriptor& + operator=(backend_descriptor const&) = delete; + + /** + * @brief Initializes a `backend_descriptor` object. + * + * @param type The type of the backend descriptor to create. + */ + error_t + initialize(cudnnBackendDescriptorType_t type) { + _CUDNN_CHECK_CUDNN_ERROR(detail::create_descriptor(type, &desc)); + return {error_code_t::OK, ""}; + } + + /** + * @brief Finalizes a `backend_descriptor` object. + * + */ + error_t + finalize() { + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(desc)); + return {error_code_t::OK, ""}; + } + + /** + * @brief Accessor for the underlying `cudnnBackendDescriptor_t` object. + * + * @return A const reference to `cudnnBackendDescriptor_t`, the raw pointer to the backend descriptor. + */ + cudnnBackendDescriptor_t const& + get_ptr() const { + return desc; + } + + /** + * @brief Accessor for the status of the backend descriptor creation. + * + * @return `cudnnStatus_t` The status of the backend descriptor creation operation. 
+ */ + cudnnStatus_t + get_status() const { + return status; + } + + /** + * @brief Constructs a default `backend_descriptor` object, but without initializing descriptor + * + * Used to return an error code to user for incorrect cuDNN version + */ + backend_descriptor() = default; + + private: + cudnnBackendDescriptor_t desc = nullptr; //!< Raw pointer to the backend descriptor. + cudnnStatus_t status = CUDNN_STATUS_NOT_INITIALIZED; //!< Status of the descriptor creation operation. +}; + +} // namespace cudnn_frontend::detail \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/backend/device_properties.h b/third_party/cudnn-frontend/include/cudnn_frontend/backend/device_properties.h new file mode 100644 index 00000000..d31c5550 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/backend/device_properties.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "../graph_helpers.h" +#include "backend_descriptor.h" + +namespace cudnn_frontend { +/// +/// DeviceProperties Class +/// Wraps the device_properties backend descriptor +/// Wraps backend utility functions for user's convenience +/// Backend accessor functions: size() +/// Contains internal utilities for device properties finalization and operation graph attributes +/// +class DeviceProperties : public detail::backend_descriptor { + public: + // Uses the default backend constructor so that we can check for initialization error during build() + DeviceProperties() = default; + + std::string + describe() const { + std::stringstream ss; + ss << "CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR : " << std::endl; + return ss.str(); + } + + inline DeviceProperties& + set_device_id(int32_t device_id) { + this->device_id = device_id; + return *this; + } + + inline DeviceProperties& + set_handle(cudnnHandle_t handle) { + this->handle = handle; + return *this; + } + + // Used to check device properties status (particularly after initialization) + error_t + status() const { + if (get_status() != CUDNN_STATUS_SUCCESS) { + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR: Check CUDNN_VERSION >= 9.8"}; + } + return {}; + } + + error_t + serialize(std::vector& serialization_buf) const { +#if (CUDNN_VERSION >= 90800) + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90800, + error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION is only available starting 9.8."); + + int64_t serializationSize; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute( + 
get_ptr(), CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION, CUDNN_TYPE_CHAR, 0, &serializationSize, nullptr)); + serialization_buf.resize(static_cast(serializationSize)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(get_ptr(), + CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION, + CUDNN_TYPE_CHAR, + serializationSize, + &serializationSize, + serialization_buf.data())); + return {}; +#else + (void)serialization_buf; + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION is only available starting 9.8."}; +#endif + } + + error_t + deserialize(const std::vector& serialized_buf) { +#if (CUDNN_VERSION >= 90800) + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90800, + error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION is only available starting 9.8."); + + // Check if the device properties is already initialized + RETURN_CUDNN_FRONTEND_ERROR_IF( + get_ptr() != nullptr, error_code_t::CUDNN_BACKEND_API_FAILED, "Device properties is already initialized."); + + // Initialize the device properties descriptor + CHECK_CUDNN_FRONTEND_ERROR(initialize(CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(get_ptr(), + CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION, + CUDNN_TYPE_CHAR, + serialized_buf.size(), + serialized_buf.data())); + + CHECK_CUDNN_FRONTEND_ERROR(finalize()); + return {}; +#else + (void)serialized_buf; + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION is only available starting 9.8."}; +#endif + } + + // Check for both compile-time and runtime cuDNN version + error_t + build() { +#if (CUDNN_VERSION >= 90800) + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90800, + error_code_t::GRAPH_NOT_SUPPORTED, + "CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR is only available starting 9.8."); + if (get_ptr() == nullptr) { + CHECK_CUDNN_FRONTEND_ERROR(initialize(CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR)); + } + + if 
(handle != nullptr) { + _CUDNN_CHECK_CUDNN_ERROR( + detail::set_attribute(get_ptr(), CUDNN_ATTR_DEVICEPROP_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle)); + } + + if (device_id >= 0) { + _CUDNN_CHECK_CUDNN_ERROR( + detail::set_attribute(get_ptr(), CUDNN_ATTR_DEVICEPROP_DEVICE_ID, CUDNN_TYPE_INT32, 1, &device_id)); + } + + CHECK_CUDNN_FRONTEND_ERROR(finalize()); + return {}; +#else + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR is only available starting 9.8."}; +#endif + } + + private: + cudnnHandle_t handle = nullptr; + int32_t device_id = 0; +}; +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/backend/execution_helpers.h b/third_party/cudnn-frontend/include/cudnn_frontend/backend/execution_helpers.h new file mode 100644 index 00000000..cfc139e6 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/backend/execution_helpers.h @@ -0,0 +1,99 @@ +#pragma once + +#include + +#include "cudnn.h" + +#include "backend_descriptor.h" + +namespace cudnn_frontend::detail { +/** + * @brief Creates a CUDNN backend variant pack descriptor. + * + * This function creates a `backend_descriptor` object representing a CUDNN backend variant pack + * descriptor. The variant pack descriptor is configured with the provided device pointers, unique + * IDs, and a workspace pointer. + * + * @param[out] variant_pack The created `backend_descriptor` object representing the variant pack. + * @param device_ptrs A vector of device pointers to be associated with the variant pack. + * @param uids A vector of unique IDs to be associated with the variant pack. + * @param workspace_ptr A pointer to the workspace memory to be associated with the variant pack. + * @return `error_t` A tuple containing the error code and an optional error message. + * The error code is `error_code_t::OK` on success, or an appropriate error code on failure. 
+ */ +inline error_t +create_variant_pack(backend_descriptor& variant_pack, + std::vector& device_ptrs, + std::vector const& uids, + void* workspace_ptr) { + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + variant_pack.get_ptr(), CUDNN_ATTR_VARIANT_PACK_WORKSPACE, CUDNN_TYPE_VOID_PTR, 1, &workspace_ptr)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(variant_pack.get_ptr(), + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, + CUDNN_TYPE_VOID_PTR, + device_ptrs.size(), + device_ptrs.data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + variant_pack.get_ptr(), CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, CUDNN_TYPE_INT64, uids.size(), uids.data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(variant_pack.get_ptr())); + + return {error_code_t::OK, ""}; +} + +inline error_t +create_variant_pack(backend_descriptor& variant_pack, + std::vector& device_ptrs, + std::vector const& uids, + void* workspace_ptr, + std::vector const& override_uids, + std::vector> const& override_shapes, + std::vector> const& override_strides) { + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Dynamic shapes requires cuDNN v9.18.0"}; + + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(91800, cudnn_ver_error); + + CUDNN_FRONTEND_UNUSED(override_uids); + CUDNN_FRONTEND_UNUSED(override_shapes); + CUDNN_FRONTEND_UNUSED(override_strides); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + variant_pack.get_ptr(), CUDNN_ATTR_VARIANT_PACK_WORKSPACE, CUDNN_TYPE_VOID_PTR, 1, &workspace_ptr)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(variant_pack.get_ptr(), + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, + CUDNN_TYPE_VOID_PTR, + device_ptrs.size(), + device_ptrs.data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + variant_pack.get_ptr(), CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, CUDNN_TYPE_INT64, uids.size(), uids.data())); + +#if (CUDNN_VERSION >= 91800) + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(variant_pack.get_ptr(), + CUDNN_ATTR_VARIANT_PACK_OVERRIDE_UNIQUE_IDS, + 
CUDNN_TYPE_INT64, + override_uids.size(), + override_uids.data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(variant_pack.get_ptr(), + CUDNN_ATTR_VARIANT_PACK_OVERRIDE_SHAPES, + CUDNN_TYPE_VOID_PTR, + 1, + (void*)&override_shapes)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(variant_pack.get_ptr(), + CUDNN_ATTR_VARIANT_PACK_OVERRIDE_STRIDES, + CUDNN_TYPE_VOID_PTR, + 1, + (void*)&override_strides)); +#endif + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(variant_pack.get_ptr())); + + return {error_code_t::OK, ""}; +} + +} // namespace cudnn_frontend::detail diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/backend/kernel_cache.h b/third_party/cudnn-frontend/include/cudnn_frontend/backend/kernel_cache.h new file mode 100644 index 00000000..ef173b2f --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/backend/kernel_cache.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "../graph_helpers.h" +#include "backend_descriptor.h" + +namespace cudnn_frontend { +namespace graph { +class Graph; +} // namespace graph +/// +/// KernelCache Class +/// Wraps the kernel_cache backend descriptor +/// Wraps backend utility functions for user's convenience +/// Backend accessor functions: size() +/// Contains internal utilities for kernel cache finalization and operation graph attributes +/// +class KernelCache : public detail::backend_descriptor { + public: + friend class graph::Graph; + // Uses the default backend constructor so that we can check for initialization error during build() + KernelCache() : backend_descriptor() {} + + std::string + describe() const { + std::stringstream ss; + ss << "CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR : " << std::endl; + return ss.str(); + } + + bool + is_finalized() { + return finalized; + } + + // Used to check kernel cache status (particularly after initialization) + error_t + status() { + if (get_status() != CUDNN_STATUS_SUCCESS) { + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR: Check CUDNN_VERSION >= 9.4"}; + } + return {}; + } + + error_t + to_json(std::string &str_json) const { + str_json.clear(); +#if (CUDNN_VERSION >= 91000) + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 91000, + error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION is only available starting 9.10."); + + int64_t serializationSize; + std::vector serialization_buf; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute( + get_ptr(), CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION, 
CUDNN_TYPE_CHAR, 0, &serializationSize, nullptr)); + serialization_buf.resize(static_cast(serializationSize)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(get_ptr(), + CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION, + CUDNN_TYPE_CHAR, + serializationSize, + &serializationSize, + serialization_buf.data())); + std::string json_string(serialization_buf.begin(), serialization_buf.end()); + str_json = std::move(json_string); + return {}; +#else + (void)str_json; + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION is only available starting 9.10."}; +#endif + } + + error_t + from_json(const std::string &json_cache) { +#if (CUDNN_VERSION >= 91000) + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 91000, + error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION is only available starting 9.10."); + + // Check if the kernel cache is already initialized + RETURN_CUDNN_FRONTEND_ERROR_IF( + get_ptr() != nullptr, error_code_t::CUDNN_BACKEND_API_FAILED, "Kernel cache is already initialized."); + + // // Initialize the kernel cache descriptor + CHECK_CUDNN_FRONTEND_ERROR(initialize(CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR)); + + std::vector serialization_buf; + serialization_buf.assign(json_cache.begin(), json_cache.end()); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(get_ptr(), + CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION, + CUDNN_TYPE_CHAR, + serialization_buf.size(), + serialization_buf.data())); + return {}; +#else + (void)json_cache; + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION is only available starting 9.10."}; +#endif + } + + // Responsible for initializing, setting operation graph attribute, and finalizing kernel cache + // Check for both compile-time and runtime cuDNN version + error_t + build(cudnnBackendDescriptor_t op_graph) { +#if (CUDNN_VERSION >= 90400) + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 
90400, + error_code_t::GRAPH_NOT_SUPPORTED, + "CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR is only available starting 9.4."); + if (get_ptr() == nullptr) { + CHECK_CUDNN_FRONTEND_ERROR(initialize(CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR)); + } +#if (CUDNN_VERSION >= 90500) + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90500, + error_code_t::GRAPH_NOT_SUPPORTED, + "CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH is only available starting 9.5."); + if (op_graph) { + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + get_ptr(), CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph)); + } +#else + (void)op_graph; +#endif + CHECK_CUDNN_FRONTEND_ERROR(finalize()); + finalized = true; + return {}; +#else + (void)op_graph; + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR is only available starting 9.4."}; +#endif + } + + private: + bool finalized = false; +}; +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/backend/plan_helpers.h b/third_party/cudnn-frontend/include/cudnn_frontend/backend/plan_helpers.h new file mode 100644 index 00000000..4ef47e22 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/backend/plan_helpers.h @@ -0,0 +1,212 @@ +#pragma once + +#include + +#include "cudnn.h" + +#include "backend_descriptor.h" +#include "../knobs.h" + +namespace cudnn_frontend::detail { +/** + * @brief Creates a CUDNN backend variant pack descriptor. + * + * This function creates a `backend_descriptor` object representing a CUDNN backend variant pack + * descriptor. The variant pack descriptor is configured with the provided device pointers, unique + * IDs, and a workspace pointer. + * + * @param[out] variant_pack The created `backend_descriptor` object representing the variant pack. + * @param device_ptrs A vector of device pointers to be associated with the variant pack. 
+ * @param uids A vector of unique IDs to be associated with the variant pack. + * @param workspace_ptr A pointer to the workspace memory to be associated with the variant pack. + * @return `error_t` A tuple containing the error code and an optional error message. + * The error code is `error_code_t::OK` on success, or an appropriate error code on failure. + */ +inline error_t +get_workspace_size(ManagedOpaqueDescriptor& engine_config, int64_t& workspace) { +#if CUDNN_VERSION >= 90200 + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE, + CUDNN_TYPE_INT64, + 1, + nullptr, + &workspace)); + return {error_code_t::OK, ""}; +#else + (void)engine_config; + (void)workspace; + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE is only available starting 9.2."}; +#endif +} + +inline error_t +get_shared_memory_size(ManagedOpaqueDescriptor& engine_config, int32_t& shared_memory_size) { +#if CUDNN_VERSION >= 90200 + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED, + CUDNN_TYPE_INT32, + 1, + nullptr, + &shared_memory_size)); + return {error_code_t::OK, ""}; +#else + (void)engine_config; + (void)shared_memory_size; + return {error_code_t::CUDNN_BACKEND_API_FAILED, + "CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED is only available starting 9.2."}; +#endif +} + +inline error_t +create_engine(backend_descriptor& engine, + int64_t const engine_id, + cudnnBackendDescriptor_t op_graph, + std::shared_ptr device_properties = nullptr) { + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + engine.get_ptr(), CUDNN_ATTR_ENGINE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph)); + + // Validate before setting + int64_t count; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute( + op_graph, CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT, CUDNN_TYPE_INT64, 1, nullptr, &count)); + 
RETURN_CUDNN_FRONTEND_ERROR_IF( + engine_id >= count || engine_id < 0, error_code_t::INVALID_VALUE, "Invalid engine id."); + + _CUDNN_CHECK_CUDNN_ERROR( + detail::set_attribute(engine.get_ptr(), CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &engine_id)); + + if (device_properties != nullptr) { +#if (CUDNN_VERSION >= 90800) + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(engine.get_ptr(), + CUDNN_ATTR_ENGINE_DEVICEPROP, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &device_properties->get_ptr())); +#endif + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(engine.get_ptr())); + + return {error_code_t::OK, ""}; +} + +inline error_t +query_knobs(int64_t const engine_id, cudnnBackendDescriptor_t op_graph, std::vector& knobs) { + detail::backend_descriptor engine(CUDNN_BACKEND_ENGINE_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(engine.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create engine's backend descriptor."); + CHECK_CUDNN_FRONTEND_ERROR(detail::create_engine(engine, engine_id, op_graph)); + + // Initialize a backend descriptor for each knob type + // The size of the array should be CUDNN_KNOB_TYPE_COUNTS, as currently we dont know how many knobs the engine will + // support + std::array frontend_knobs; + for (size_t i = 0; i < CUDNN_KNOB_TYPE_COUNTS; i++) { + backend_descriptor frontend_knob(CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(frontend_knob.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create knob's backend descriptor."); + frontend_knobs[i] = std::move(frontend_knob); + } + + // Create an auxillary array to hold the raw knob descriptors + std::array backend_knobs; + for (size_t i = 0; i < CUDNN_KNOB_TYPE_COUNTS; i++) { + backend_knobs[i] = frontend_knobs[i].get_ptr(); + } + + // This is the actual number of knobs that is supported by the engine + int64_t knobs_size; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(engine.get_ptr(), 
+ CUDNN_ATTR_ENGINE_KNOB_INFO, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + CUDNN_KNOB_TYPE_COUNTS, + &knobs_size, + backend_knobs.data())); + + for (int64_t i = 0; i < knobs_size; i++) { + cudnnBackendKnobType_t type; + int64_t elemCount; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute( + frontend_knobs[i].get_ptr(), CUDNN_ATTR_KNOB_INFO_TYPE, CUDNN_TYPE_KNOB_TYPE, 1, &elemCount, &type)); + + int64_t maxValue; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(frontend_knobs[i].get_ptr(), + CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE, + CUDNN_TYPE_INT64, + 1, + &elemCount, + &maxValue)); + + int64_t minValue; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(frontend_knobs[i].get_ptr(), + CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE, + CUDNN_TYPE_INT64, + 1, + &elemCount, + &minValue)); + + int64_t stride; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute( + frontend_knobs[i].get_ptr(), CUDNN_ATTR_KNOB_INFO_STRIDE, CUDNN_TYPE_INT64, 1, &elemCount, &stride)); + + auto frontend_knob_type = convert_from_backend_knob_type(type); + knobs.emplace_back(frontend_knob_type, maxValue, minValue, stride); + } + + return {error_code_t::OK, ""}; +} + +inline error_t +set_knob_choices(std::unordered_map const& user_choices, + std::vector& knob_choices) { + for (auto const& [type, choice] : user_choices) { + backend_descriptor knob_choice(CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(knob_choice.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create knob_choice's backend descriptor."); + + cudnnBackendKnobType_t backend_type; + _CUDNN_CHECK_CUDNN_ERROR(convert_to_backend_knob_type(type, backend_type)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + knob_choice.get_ptr(), CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE, CUDNN_TYPE_KNOB_TYPE, 1, &backend_type)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + knob_choice.get_ptr(), CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE, CUDNN_TYPE_INT64, 1, &choice)); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::finalize(knob_choice.get_ptr())); + + knob_choices.push_back(std::move(knob_choice)); + } + + return {error_code_t::OK, ""}; +} + +inline error_t +create_engine_config(ManagedOpaqueDescriptor& engine_config, + backend_descriptor& engine, + std::vector& knob_choices) { + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_ENGINE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(engine.get_ptr()))); + + std::vector backend_knob_choices(CUDNN_KNOB_TYPE_COUNTS); + for (size_t i = 0; i < knob_choices.size(); i++) { + backend_knob_choices[i] = knob_choices[i].get_ptr(); + } + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + knob_choices.size(), + backend_knob_choices.data())); + + // Finalizing the descriptor + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(engine_config->get_backend_descriptor())); + + return {error_code_t::OK, ""}; +} + +} // namespace cudnn_frontend::detail diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/context.h b/third_party/cudnn-frontend/include/cudnn_frontend/context.h new file mode 100644 index 00000000..8c894b22 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/context.h @@ -0,0 +1,110 @@ +#pragma once + +#include "../cudnn_frontend_utils.h" + +namespace cudnn_frontend::detail { + +class Context { + DataType_t compute_data_type = DataType_t::NOT_SET; + DataType_t intermediate_data_type = DataType_t::NOT_SET; + DataType_t io_data_type = DataType_t::NOT_SET; + int32_t target_sm_count = -1; + int32_t target_sm_version = -1; + bool is_dynamic_shape_enabled = false; + + std::string name = ""; + + public: + Context& + set_intermediate_data_type(DataType_t const type) { + intermediate_data_type = type; + return *this; + } + + Context& + set_io_data_type(DataType_t const type) { + io_data_type = type; + return *this; + } + + 
Context& + set_compute_data_type(DataType_t const type) { + compute_data_type = type; + return *this; + } + + DataType_t + get_io_data_type() const { + return io_data_type; + } + + DataType_t + get_intermediate_data_type() const { + return intermediate_data_type; + } + + DataType_t + get_compute_data_type() const { + return compute_data_type; + } + + Context& + set_name(std::string const& name_) { + name = name_; + return *this; + } + + std::string + get_name() const { + return name; + } + + Context& + set_target_sm_count(int32_t count) { + target_sm_count = count; + return *this; + } + + Context& + set_sm_version(int32_t version) { + target_sm_version = version; + return *this; + } + + Context& + set_dynamic_shape_enabled(bool is_enabled) { + is_dynamic_shape_enabled = is_enabled; + return *this; + } + + bool + get_dynamic_shape_enabled() const { + return is_dynamic_shape_enabled; + } + + int32_t + get_target_sm_count() const { + return target_sm_count; + } + + int32_t + get_sm_version() const { + return target_sm_version; + } + + Context& + fill_missing_properties(Context const& global_context) { + if (get_compute_data_type() == DataType_t::NOT_SET) { + set_compute_data_type(global_context.get_compute_data_type()); + } + if (get_intermediate_data_type() == DataType_t::NOT_SET) { + set_intermediate_data_type(global_context.get_intermediate_data_type()); + } + if (get_io_data_type() == DataType_t::NOT_SET) { + set_io_data_type(global_context.get_io_data_type()); + } + return *this; + } +}; + +} // namespace cudnn_frontend::detail \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/cudnn_interface.h b/third_party/cudnn-frontend/include/cudnn_frontend/cudnn_interface.h new file mode 100644 index 00000000..c142c61f --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/cudnn_interface.h @@ -0,0 +1,223 @@ +#pragma once + +#include +#include +#include + +#include "../cudnn_frontend_Tensor.h" +#include 
"../cudnn_frontend_Operation.h" +#include "../cudnn_frontend_OperationGraph.h" +#include "../cudnn_frontend_EngineConfig.h" +#include "../cudnn_frontend_ExecutionPlan.h" +#include "../cudnn_frontend_VariantPack.h" + +#include "graph_properties.h" +#include "graph_helpers.h" +#include "plans.h" + +namespace cudnn_frontend { + +namespace detail { +inline void +assign_uid(graph::Tensor_attributes* const tensor, + int64_t& potential_uid, + std::unordered_set const& used_uids) { + // get_next_potential_uid + while (used_uids.find(potential_uid) != used_uids.end()) { + ++potential_uid; + } + + tensor->set_uid(potential_uid); + ++potential_uid; // increment, as used its used now +} + +// TODO: Always returns OK. Can the status and error message be accessed from tensor descriptor? +inline error_t +create_cudnn_tensor( + std::shared_ptr const& props, + std::unordered_map>& tensors, + int64_t& potential_uid, + std::unordered_set const& used_uids) { + // Assign tensor a uid + if (props->has_uid() == false) { + assign_uid(props.get(), potential_uid, used_uids); + } + + // Check whether backend tensor already created + auto tensor_uid = props->get_uid(); + if (tensors.find(tensor_uid) != tensors.end()) { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Backend Tensor named '" << props->get_name() << "' with UID " << tensor_uid + << " already created."); + return {error_code_t::OK, ""}; + } + CUDNN_FE_LOG_LABEL_ENDL("INFO: Backend Tensor named '" << props->get_name() << "' with UID " << tensor_uid + << " being created."); + + auto&& tensor_builder = cudnn_frontend::TensorBuilder(); + + tensor_builder.setDim(props->get_dim().size(), props->get_dim().data()) + .setStrides(props->get_stride().size(), props->get_stride().data()) + .setId(tensor_uid) + .setAlignment(props->get_alignment()) + .setDataType(props->get_data_type()) + .setVirtual(props->get_is_virtual()) + .setByValue(props->get_is_pass_by_value()) + .setReorderType(props->get_reordering_type()); + + // Set vector count and dimension if 
they are non-default + if (props->get_vector_count() > 1 || props->get_vector_dimension() >= 0) { + tensor_builder.setVectorCountAndDimension(props->get_vector_count(), props->get_vector_dimension()); + } + + if (auto ragged_offset_props = props->get_ragged_offset()) { + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(ragged_offset_props, tensors, potential_uid, used_uids)); + tensor_builder.setRaggedOffset(tensors.at(ragged_offset_props->get_uid())); + } + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. + auto tensor = tensor_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF( + tensor.get_status() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, tensor.get_error()); + tensors.emplace(tensor_uid, std::make_shared(std::move(tensor))); +#else + // build() can throw + // wrap in try catch + try { + auto tensor = tensor_builder.build(); + tensors.emplace(tensor_uid, std::make_shared(std::move(tensor))); + } catch (cudnn_frontend::cudnnException& e) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); + } +#endif + + return {error_code_t::OK, ""}; +} +} // namespace detail + +class ICudnn { + protected: + using uid_t = int64_t; + + //// Store tensors and operations as they (probably?) need to be kept alive. + // + // The tensor mapping from fe::Tensor to be::Tensor. + // + // sub nodes share fe::Tensor. Example, in a conv-bias graph, conv output Y and bias input IN_0 are the same + // fe::Tensor. But both sub ndoes need to work together to make sure only one be::Tensor is created. Hence this + // uid_to_backend_tensors acts as the global registry for each sub node to use. + // + // Key cannot be fe::Tensor, or shared_ptr, or underlying object address of fe::Tensor. + // Hence using uid, as that uniquely identifies both types of tensors. 
+ std::unordered_map> uid_to_tensors; + std::vector> operations; + graph::managed_backend_descriptor_t raw_operations; + + std::shared_ptr operation_graph; + std::unordered_set variant_pack_uids; + + graph::Execution_plan_list plans; + + bool is_dynamic_shape_enabled = false; + std::shared_ptr kernel_cache = nullptr; + + std::shared_ptr device_properties = nullptr; + + error_t + create_cudnn_operation_graph(cudnnHandle_t handle) { + std::vector cudnn_operations; + for (std::shared_ptr operation : operations) { + cudnn_operations.push_back(operation.get()); + } + + auto&& cudnn_operation_graph_builder = cudnn_frontend::OperationGraphBuilder(); + cudnn_operation_graph_builder.setHandle(handle) + .setOperationGraph(cudnn_operations.size(), cudnn_operations.data()) + .setIsDynamicShapeEnabled(is_dynamic_shape_enabled); + for (auto& op : raw_operations) { + cudnn_operation_graph_builder.addOperation(op); + } + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto cudnn_operation_graph = cudnn_operation_graph_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(cudnn_operation_graph.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + cudnn_operation_graph.get_error()); + operation_graph = std::make_shared(std::move(cudnn_operation_graph)); +#else + // build() can throw + // wrap in try catch + try { + auto cudnn_operation_graph = cudnn_operation_graph_builder.build(); + operation_graph = std::make_shared(std::move(cudnn_operation_graph)); + } catch (cudnn_frontend::cudnnException& e) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); + } +#endif + return {error_code_t::OK, "Successfully built Operation Graph."}; + } + + public: + error_t + get_cudnn_workspace_size_node(int64_t const plan_index, int64_t& cudnn_workspace_size) const { + CHECK_CUDNN_FRONTEND_ERROR(plans.is_plan_index_executable(plan_index)); + + cudnn_workspace_size = std::max(cudnn_workspace_size, plans.execution_plans[plan_index]->getWorkspaceSize()); + + return {error_code_t::OK, ""}; + } + + int64_t + get_max_cudnn_workspace_size_node() const { + return plans.get_autotune_workspace(); + } + + error_t + execute_cudnn_plan_with_uid(cudnnHandle_t handle, + std::unordered_map const& tensor_uid_to_pointer_map, + void* workspace_ptr, + int64_t plan_index, + std::vector const& override_uids, + std::vector> const& override_shapes, + std::vector> const& override_strides) const { + // Make sure device pointer is provided for all uids expected for this plan + std::vector device_ptrs; + std::vector uids; + for (auto const& uid : variant_pack_uids) { + auto search = tensor_uid_to_pointer_map.find(uid); + RETURN_CUDNN_FRONTEND_ERROR_IF(search == tensor_uid_to_pointer_map.end(), + error_code_t::INVALID_VARIANT_PACK, + "Uid " + std::to_string(uid) + " does not exist in variant pack."); + device_ptrs.push_back(search->second); + uids.push_back(uid); + } + + 
CHECK_CUDNN_FRONTEND_ERROR(plans.is_plan_index_executable(plan_index)); + + CUDNN_FE_LOG_LABEL_ENDL("INFO: Executing plan at index " << plan_index + << " with override uids: " << override_uids.size()); + + if (override_uids.size() == 0) { + CHECK_CUDNN_FRONTEND_ERROR( + detail::execute(handle, plans.execution_plans[plan_index].get(), device_ptrs, uids, workspace_ptr)); + } else { + CHECK_CUDNN_FRONTEND_ERROR(detail::execute(handle, + plans.execution_plans[plan_index].get(), + device_ptrs, + uids, + workspace_ptr, + override_uids, + override_shapes, + override_strides)); + } + + return {error_code_t::OK, ""}; + } +}; + +} // namespace cudnn_frontend diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/graph_helpers.h b/third_party/cudnn-frontend/include/cudnn_frontend/graph_helpers.h new file mode 100644 index 00000000..9cf45c16 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/graph_helpers.h @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace cudnn_frontend { + +enum class [[nodiscard]] error_code_t { + OK, + ATTRIBUTE_NOT_SET, + SHAPE_DEDUCTION_FAILED, + INVALID_TENSOR_NAME, + INVALID_VARIANT_PACK, + GRAPH_NOT_SUPPORTED, + GRAPH_EXECUTION_PLAN_CREATION_FAILED, + GRAPH_EXECUTION_FAILED, + HEURISTIC_QUERY_FAILED, + UNSUPPORTED_GRAPH_FORMAT, + CUDA_API_FAILED, + CUDNN_BACKEND_API_FAILED, + INVALID_CUDA_DEVICE, + HANDLE_ERROR, + INVALID_VALUE +}; + +typedef struct [[nodiscard]] error_object { + error_code_t code; + std::string err_msg; + error_object() : code(error_code_t::OK), err_msg("") {}; + error_object(error_code_t err, std::string msg) : code(err), err_msg(msg) {}; + + error_code_t + get_code() { + return code; + } + + std::string + get_message() { + return err_msg; + } + + bool + is_good() const { + return code == error_code_t::OK; + } + + bool + is_bad() const { + return !is_good(); + } + + bool + operator==(error_code_t compare_code) { + return code == compare_code; + } + + bool + operator!=(error_code_t compare_code) { + return code != compare_code; + } + +} error_t; + +#ifdef WIN32 +#define CUDNN_FRONTEND_WHILE_FALSE \ + __pragma(warning(push)) __pragma(warning(disable : 4127)) while (0) __pragma(warning(pop)) +#else +#define CUDNN_FRONTEND_WHILE_FALSE while (0) +#endif + +#define CHECK_CUDNN_FRONTEND_ERROR(x) \ + do { \ + if (auto retval = x; retval.is_bad()) { \ + CUDNN_FE_LOG_LABEL_ENDL("ERROR: " << #x << " at " << __FILE__ << ":" << __LINE__); \ + return retval; \ + } \ + } \ + CUDNN_FRONTEND_WHILE_FALSE + +#define RETURN_CUDNN_FRONTEND_ERROR_IF(cond, retval, message) \ + do { \ + if (cond) { \ + if (retval == 
error_code_t::OK) { \ + CUDNN_FE_LOG_LABEL("INFO: "); \ + } else { \ + CUDNN_FE_LOG_LABEL("ERROR: "); \ + } \ + CUDNN_FE_LOG(message << ". " << retval << " because (" << #cond ") at " << __FILE__ << ":" << __LINE__ \ + << "\n"); \ + return {retval, message}; \ + } \ + } \ + CUDNN_FRONTEND_WHILE_FALSE + +#define _CUDNN_CHECK_CUDNN_ERROR(x) \ + do { \ + if (auto cudnn_retval = x; cudnn_retval != CUDNN_STATUS_SUCCESS) { \ + std::stringstream error_msg; \ + error_msg << #x << " failed with message: " << detail::get_last_error_string_() \ + << ", and code: " << detail::get_error_string(cudnn_retval); \ + CUDNN_FE_LOG_LABEL_ENDL("ERROR: " << error_msg.str() << " at " << __FILE__ << ":" << __LINE__); \ + return {error_code_t::CUDNN_BACKEND_API_FAILED, error_msg.str()}; \ + } \ + } \ + CUDNN_FRONTEND_WHILE_FALSE + +#define _CUDNN_CHECK_CUDA_ERROR(x) \ + do { \ + if (auto cuda_retval = x; cuda_retval != cudaSuccess) { \ + std::stringstream error_msg; \ + error_msg << #x << " failed with " << detail::cuda_get_error_string(cuda_retval); \ + CUDNN_FE_LOG_LABEL_ENDL("ERROR: " << error_msg.str() << " at " << __FILE__ << ":" << __LINE__); \ + return {error_code_t::CUDA_API_FAILED, error_msg.str()}; \ + } \ + } \ + CUDNN_FRONTEND_WHILE_FALSE + +#define CHECK_CU_ERROR(x) \ + do { \ + if (auto cu_retval = x; cu_retval != CUDA_SUCCESS) { \ + std::stringstream error_msg; \ + const char* error_code_string; \ + detail::cu_get_error_string(cu_retval, &error_code_string); \ + error_msg << #x << " failed with " << error_code_string; \ + getLogger() << "[cudnn_frontend] ERROR: " << error_msg.str() << " at " << __FILE__ << ":" << __LINE__ \ + << std::endl; \ + return {error_code_t::CUDA_API_FAILED, error_msg.str()}; \ + } \ + } \ + CUDNN_FRONTEND_WHILE_FALSE + +NLOHMANN_JSON_SERIALIZE_ENUM(error_code_t, + { + {error_code_t::OK, "OK"}, + {error_code_t::ATTRIBUTE_NOT_SET, "ATTRIBUTE_NOT_SET"}, + {error_code_t::SHAPE_DEDUCTION_FAILED, "SHAPE_DEDUCTION_FAILED"}, + 
{error_code_t::INVALID_TENSOR_NAME, "INVALID_TENSOR_NAME"}, + {error_code_t::INVALID_VARIANT_PACK, "INVALID_VARIANT_PACK"}, + {error_code_t::GRAPH_NOT_SUPPORTED, "GRAPH_NOT_SUPPORTED"}, + {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "GRAPH_EXECUTION_PLAN_CREATION_FAILED"}, + {error_code_t::GRAPH_EXECUTION_FAILED, "GRAPH_EXECUTION_FAILED"}, + {error_code_t::HEURISTIC_QUERY_FAILED, "HEURISTIC_QUERY_FAILED"}, + {error_code_t::CUDNN_BACKEND_API_FAILED, "CUDNN_BACKEND_API_FAILED"}, + {error_code_t::CUDA_API_FAILED, "CUDA_API_FAILED"}, + {error_code_t::INVALID_CUDA_DEVICE, "INVALID_CUDA_DEVICE"}, + {error_code_t::UNSUPPORTED_GRAPH_FORMAT, "UNSUPPORTED_GRAPH_FORMAT"}, + {error_code_t::HANDLE_ERROR, "HANDLE_ERROR"}, + {error_code_t::INVALID_VALUE, "INVALID_VALUE"}, + }) + +static inline std::ostream& +operator<<(std::ostream& os, const error_code_t& mode) { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + os << json{mode}; +#else + os << int(mode); +#endif + return os; +} + +static inline std::ostream& +operator<<(std::ostream& os, cudnn_frontend::error_object& err) { + os << err.get_code() << err.get_message(); + return os; +} + +static bool +allowAllConfig(cudnnBackendDescriptor_t engine_config) { + (void)engine_config; + return false; +} + +namespace detail { + +inline bool +is_activation_backward_mode(PointwiseMode_t const mode) { + return ((mode == PointwiseMode_t::RELU_BWD) || (mode == PointwiseMode_t::TANH_BWD) || + (mode == PointwiseMode_t::SIGMOID_BWD) || (mode == PointwiseMode_t::ELU_BWD) || + (mode == PointwiseMode_t::GELU_BWD) || (mode == PointwiseMode_t::GELU_APPROX_TANH_BWD) || + (mode == PointwiseMode_t::SOFTPLUS_BWD) || (mode == PointwiseMode_t::SWISH_BWD)); +} + +// Creates dense, non-overlapping strides from given dim and stride_order. +// For example, if a is a 4D tensor with dimensions labeled NCHW, then strided(a, (3, 0, 2, 1)) produces +// strides where the C dimension has a corresponding stride of one. 
+inline std::vector +generate_stride(std::vector const& dim, std::vector const& stride_order) { + size_t num_dims = dim.size(); + std::vector stride(num_dims); + + // Sort the dimensions according to strides from least to greatest. + // Example, dim = (2, 3, 4, 5) stride_order = (3, 1, 2, 0) + // sorted_stride_order = ((0, (3, 5)), (1, (1, 3)), (2, (2, 4)), (3, (0, 2))) + std::vector>> sorted_stride_order; + for (size_t i = 0; i < num_dims; ++i) { + sorted_stride_order.push_back({stride_order[i], {i, dim[i]}}); + } + std::sort(sorted_stride_order.begin(), sorted_stride_order.end()); + + // As dims have now been ordered starting from fastest changing, + // just fill in strides by iterating linearly over them. + int64_t product = 1; + for (size_t i = 0; i < num_dims; ++i) { + stride[sorted_stride_order[i].second.first] = product; + product *= sorted_stride_order[i].second.second; + } + + return stride; +} + +// Generate NHWC stride_order +inline std::vector +generate_NHWC_stride_order(int64_t const num_dims) { + std::vector stride_order(num_dims); + + int64_t order = 0; + stride_order[1] = order++; + for (size_t i = num_dims - 1; i > 1; --i) { + stride_order[i] = order++; + } + stride_order[0] = order; + + return stride_order; +} + +// Generate row major stride_order for matrices +// dim = (*, M, N) where * is batch dimsensions +// strides should be (..., N, 1) +inline std::vector +generate_row_major_stride_order(int64_t const num_dims) { + std::vector stride_order(num_dims); + + int64_t order = num_dims - 1; + std::generate(stride_order.begin(), stride_order.end(), [&order] { return order--; }); + + return stride_order; +} + +// Generate column major stride_order for matrices +// dim = (*, M, N) +// strides should be (*, 1, M) +inline std::vector +generate_column_major_stride_order(int64_t const num_dims) { + std::vector stride_order = generate_row_major_stride_order(num_dims); + if (num_dims > 2) { + std::swap(stride_order[num_dims - 1], stride_order[num_dims - 
2]); + } + return stride_order; +} + +/** + * @brief Computes the common shape with the fewest dimensions that all input shapes can be broadcast to. + * + * This function takes a vector of shapes and calculates a common shape that all input shapes + * can be broadcast to. It follows broadcasting rules similar to those used in NumPy. + * + * @param _shapes A vector of vectors, where each inner vector represents a shape. + * Each shape is a sequence of dimension sizes. + * @param[out] common_shape The computed broadcast shape is stored in this vector. + * It will be cleared and resized as necessary. + * + * @return error_t An error code indicating the result of the operation + * + * @note + * - Shapes are processed from right to left (last dimension to first). + * - A dimension of size 1 can be broadcast to any size. + * - Non-1 dimensions must match exactly for broadcasting. + * - The resulting shape will have the maximum number of dimensions among all input shapes. + * + * @example + * std::vector> shapes = {{3, 1, 4}, {1, 2, 4}, {2, 4}}; + * std::vector result; + * error_t err = compute_broadcast_shape(shapes, result); + * // If err == error_code_t::OK, result will be {3, 2, 4} + */ +inline error_t +compute_broadcast_shape(const std::vector>& _shapes, std::vector& common_shape) { + // Filter out empty shapes + std::vector> shapes; + std::copy_if(_shapes.begin(), _shapes.end(), std::back_inserter(shapes), [](const std::vector& shape) { + return !shape.empty(); + }); + + // Short-circuits if there are no input shapes + RETURN_CUDNN_FRONTEND_ERROR_IF( + shapes.empty(), error_code_t::SHAPE_DEDUCTION_FAILED, "All input shapes provided are empty."); + + // Find the maximum dimension + int64_t max_dim = std::max_element(shapes.begin(), + shapes.end(), + [](const std::vector& a, const std::vector& b) { + return a.size() < b.size(); + }) + ->size(); + + // Initialize common_shape with 1s + common_shape.assign(max_dim, 1); + + for (const auto& shape : shapes) { + for (int 
idx = -1; idx >= -static_cast(shape.size()); --idx) { + int64_t common_idx = common_shape.size() + idx; + int64_t shape_idx = shape.size() + idx; + + if (common_shape[common_idx] == 1) { + common_shape[common_idx] = shape[shape_idx]; + } + + RETURN_CUDNN_FRONTEND_ERROR_IF((shape[shape_idx] != 1) && (common_shape[common_idx] != shape[shape_idx]), + error_code_t::SHAPE_DEDUCTION_FAILED, + "dimensions mismatch as broadcasting 2 non-one dimension sizes."); + } + } + + return {error_code_t::OK, ""}; +} +/** + * @brief Generates a stride order preserving the format of the input tensor. + * + * This function derives the exact stride order from the input tensor's strides. + * It returns the indices of the strides in ascending order of stride values. + * + * @param input_stride The stride of the input tensor + * @param output_dim_size The number of dimensions in the output tensor + * @return std::vector The generated stride order + */ +inline error_t +generate_stride_order_preserving_format(const std::vector& input_stride, + size_t output_dim_size, + std::vector& stride_order) { + std::vector indices(input_stride.size()); + std::iota(indices.begin(), indices.end(), 0); + + // Sort indices based on stride values in descending order + std::sort(indices.begin(), indices.end(), [&input_stride](int64_t i, int64_t j) { + return input_stride[i] < input_stride[j]; + }); + + // Enable this after further debug + // std::set stride_set(input_stride.begin(), input_stride.end()); + // RETURN_CUDNN_FRONTEND_ERROR_IF((stride_set.size() != input_stride.size()), + // error_code_t::SHAPE_DEDUCTION_FAILED, + // "Have multiple stride with same value. 
Cant determine stride order"); + + // Create the stride order + stride_order.resize(input_stride.size()); + for (size_t i = 0; i < indices.size(); ++i) { + stride_order[indices[i]] = i; + } + + // If output_dim_size is larger, pad with remaining dimensions + if (output_dim_size > input_stride.size()) { + size_t start = stride_order.size(); + stride_order.resize(output_dim_size); + std::iota(stride_order.begin() + start, stride_order.end(), start); + } + + return {error_code_t::OK, ""}; +} + +/** + * @brief Infers the output dimensions for a matrix multiplication operation. + * + * This function calculates the output dimensions of a matrix multiplication + * based on the input dimensions of tensors A and B. It uses compute_broadcast_shape + * for batch dimensions and ensures the last two dimensions are correct for matrix multiplication. + * + * @param a_dim Dimensions of the first input tensor (A). + * @param b_dim Dimensions of the second input tensor (B). + * @param output_dim Reference to the vector where the output dimensions will be stored. + * @return error_t An error code indicating the result of the operation. 
+ */ +inline error_t +generate_matmul_output_dim(const std::vector& a_dim, + const std::vector& b_dim, + std::vector& output_dim) { + // Ensure a_dim and b_dim have at least 2 dimensions + if (a_dim.size() < 2 || b_dim.size() < 2) { + return {error_code_t::SHAPE_DEDUCTION_FAILED, "Input tensors must have at least 2 dimensions for matmul."}; + } + + // Check if inner dimensions are compatible + if (a_dim[a_dim.size() - 1] != b_dim[b_dim.size() - 2]) { + return {error_code_t::SHAPE_DEDUCTION_FAILED, + "Inner dimensions of input tensors are not compatible for matmul."}; + } + + // Prepare shapes for broadcasting + std::vector a_batch_dim(a_dim.begin(), a_dim.end() - 2); + std::vector b_batch_dim(b_dim.begin(), b_dim.end() - 2); + + // Compute broadcast shape for batch dimensions + std::vector broadcasted_batch; + CHECK_CUDNN_FRONTEND_ERROR(detail::compute_broadcast_shape({a_batch_dim, b_batch_dim}, broadcasted_batch)); + + // Construct final output shape + output_dim = broadcasted_batch; + output_dim.push_back(a_dim[a_dim.size() - 2]); // M from A + output_dim.push_back(b_dim[b_dim.size() - 1]); // N from B + + return {error_code_t::OK, ""}; +} + +inline std::string +to_hex(const void* data, size_t num_elements, size_t elem_size) { + const auto* bytes = static_cast(data); + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < num_elements; ++i) { + if (i > 0) ss << ", "; + ss << "0x" << std::hex << std::uppercase; + switch (elem_size) { + case 1: + ss << static_cast(bytes[i]); + break; + case 2: + ss << *reinterpret_cast(&bytes[i * 2]); + break; + case 4: + ss << *reinterpret_cast(&bytes[i * 4]); + break; + case 8: + ss << *reinterpret_cast(&bytes[i * 8]); + break; + default: + ss << "?"; + } + } + ss << "]"; + return ss.str(); +} + +inline std::string +to_decimal(const void* data, size_t num_elements, size_t elem_size) { + const auto* bytes = static_cast(data); + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < num_elements; ++i) { + if (i > 0) ss 
<< ", "; + switch (elem_size) { + case 1: + ss << static_cast(bytes[i]); + break; + case 2: + ss << *reinterpret_cast(&bytes[i * 2]); + break; + case 4: + ss << *reinterpret_cast(&bytes[i * 4]); + break; + case 8: + ss << *reinterpret_cast(&bytes[i * 8]); + break; + default: + ss << "?"; + } + } + ss << "]"; + return ss.str(); +} + +inline std::string +to_base64(const void* data, size_t total_bytes) { + static const char table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + const auto* bytes = static_cast(data); + std::string result; + result.reserve(((total_bytes + 2) / 3) * 4); + for (size_t i = 0; i < total_bytes; i += 3) { + uint32_t n = static_cast(bytes[i]) << 16; + if (i + 1 < total_bytes) n |= static_cast(bytes[i + 1]) << 8; + if (i + 2 < total_bytes) n |= static_cast(bytes[i + 2]); + result.push_back(table[(n >> 18) & 0x3F]); + result.push_back(table[(n >> 12) & 0x3F]); + result.push_back((i + 1 < total_bytes) ? table[(n >> 6) & 0x3F] : '='); + result.push_back((i + 2 < total_bytes) ? 
table[n & 0x3F] : '='); + } + return result; +} + +inline error_t +log_dump_tensor_content(int64_t uid, + std::string const& name, + void* ptr, + size_t num_elements, + size_t elem_size, + char fmt, + cudaStream_t stream) { + if (!isLoggingEnabled()) return {error_code_t::OK, ""}; + + size_t total_bytes = num_elements * elem_size; + + cudaPointerAttributes attr; + _CUDNN_CHECK_CUDA_ERROR(cuda_pointer_get_attributes(&attr, ptr)); + + std::vector host_buf(total_bytes); + if (attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged) { + _CUDNN_CHECK_CUDA_ERROR(cuda_mem_cpy_async(host_buf.data(), ptr, total_bytes, cudaMemcpyDeviceToHost, stream)); + _CUDNN_CHECK_CUDA_ERROR(cuda_stream_synchronize(stream)); + } else { + std::memcpy(host_buf.data(), ptr, total_bytes); + } + + std::string data_str; + switch (fmt) { + case 'x': + data_str = to_hex(host_buf.data(), num_elements, elem_size); + break; + case 'd': + data_str = to_decimal(host_buf.data(), num_elements, elem_size); + break; + case 'b': + data_str = to_base64(host_buf.data(), total_bytes); + break; + default: + data_str = to_hex(host_buf.data(), num_elements, elem_size); + } + CUDNN_FE_LOG_LABEL_ENDL("Tensor Dump Uid: " << uid << " Name: " << name << " Data: " << data_str); + return {error_code_t::OK, ""}; +} + +inline error_t +log_variant_pack_memory_type(int64_t uid, void* ptr) { + if (!isLoggingEnabled()) return {error_code_t::OK, ""}; + + cudaPointerAttributes attributes; + _CUDNN_CHECK_CUDA_ERROR(cuda_pointer_get_attributes(&attributes, ptr)); + + auto memory_type_to_string = [](cudaMemoryType type) { + switch (type) { + case cudaMemoryTypeHost: + return std::string("Host"); + case cudaMemoryTypeDevice: + return std::string("Device"); + case cudaMemoryTypeManaged: + return std::string("Managed"); + case cudaMemoryTypeUnregistered: + return std::string("Unregistered"); + default: + return "UNKNOWN cudaMemoryType (" + std::to_string(type) + ")"; + } + }; + + auto ptr_to_string = [](void* p) { + 
std::stringstream ss; + ss << "0x" << std::hex << std::setw(sizeof(void*) * 2) << std::setfill('0') << reinterpret_cast(p); + return ss.str(); + }; + + // clang-format off + CUDNN_FE_LOG_LABEL_ENDL("Variant Pack" << std::setw(0) << " Uid: " << std::setw(20) << uid + << std::setw(0) << " MemoryType: " << std::setw(12) << memory_type_to_string(attributes.type) + << std::setw(0) << " Device: " << std::setw(4) << attributes.device + << std::setw(0) << " UnifiedPtr: " << std::setw(20) << ptr_to_string(ptr) + << std::setw(0) << " DevicePtr: " << std::setw(20) << ptr_to_string(attributes.devicePointer) + << std::setw(0) << " HostPtr: " << std::setw(20) << ptr_to_string(attributes.hostPointer)); + // clang-format on + return {error_code_t::OK, ""}; +} + +} // namespace detail + +class cudnnGraphNotSupportedException : public std::runtime_error { + public: + cudnnGraphNotSupportedException(const char* message) throw() : std::runtime_error(message) {} + + virtual const char* + what() const throw() { + return std::runtime_error::what(); + } +}; + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/graph_interface.h b/third_party/cudnn-frontend/include/cudnn_frontend/graph_interface.h new file mode 100644 index 00000000..74f699c4 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/graph_interface.h @@ -0,0 +1,2708 @@ +#pragma once + +#include +#include +#include + +#include "../cudnn_frontend_version.h" +#include "node/batchnorm.h" +#include "node/batchnorm_inference.h" +#include "node/bn_finalize.h" +#include "node/conv_fprop.h" +#include "node/conv_dgrad.h" +#include "node/conv_wgrad.h" +#include "node/dbn.h" +#include "node/dln.h" +#include "node/dbn_weight.h" +#include "node/genstats.h" +#include "node/layernorm.h" +#include "node/adaptive_layernorm.h" +#include "node/instancenorm.h" +#include "node/rmsnorm.h" +#include "node/resample.h" +#include "node/reshape.h" +#include 
"node/slice.h" +#include "node/scaled_dot_product_flash_attention.h" +#include "node/sdpa_fp8_bwd.h" +#include "node/block_scale_quantize.h" +#include "node/block_scale_dequantize.h" +#include "node/concatenate.h" +#include "node/moe_grouped_matmul.h" + +#include "backend/backend_descriptor.h" +#include "plans.h" +#include "knobs.h" +#include "graph_helpers.h" +#include "backend/kernel_cache.h" + +namespace cudnn_frontend::graph { + +class Graph : public ICudnn, public INode { + private: + std::unordered_set> full_graph_inputs; + std::unordered_set used_uids; + int64_t fe_workspace_size = 0; + + std::unordered_set> deserialized_tensor_properties; + std::unordered_map deserialized_pass_by_value; + std::unordered_map>> deserialized_workspace_modifications; + + // Cached values computed during build/deserialize, used during execute to avoid repeated collection. + // These are mutable because execute() is const but needs non-const access for pointer extraction. + mutable std::unordered_map cached_pass_by_value; + mutable std::unordered_map>> cached_workspace_modifications; + + // char: 'x'=hex, 'd'=decimal, 'b'=base64 + std::vector, char>> tensors_to_dump; + + error_t + get_pre_assigned_uids(std::unordered_set &used_uids) { + for (auto const &input : full_graph_inputs) { + if (input->has_uid()) { + auto uid = input->get_uid(); + auto iter = used_uids.find(uid); + RETURN_CUDNN_FRONTEND_ERROR_IF(iter != used_uids.end(), + error_code_t::INVALID_VALUE, + "uid " + std::to_string(uid) + " for tensor named " + input->get_name() + + " has been already assigned to another tensor."); + used_uids.insert(uid); + } + } + for (auto const &output : full_graph_outputs) { + if (output->has_uid()) { + auto uid = output->get_uid(); + auto iter = used_uids.find(uid); + RETURN_CUDNN_FRONTEND_ERROR_IF(iter != used_uids.end(), + error_code_t::INVALID_VALUE, + "uid " + std::to_string(uid) + " for tensor named " + + output->get_name() + + " has been already assigned to another tensor."); + 
used_uids.insert(uid); + } + } + + return {error_code_t::OK, ""}; + } + + error_t + pre_validate_node() const override final { + RETURN_CUDNN_FRONTEND_ERROR_IF( + (context.get_dynamic_shape_enabled() || kernel_cache != nullptr) && detail::get_backend_version() < 90400, + error_code_t::GRAPH_NOT_SUPPORTED, + "Dynamic shapes or kernel caching enabled, but cuDNN version < 9.4!"); + RETURN_CUDNN_FRONTEND_ERROR_IF(((context.get_dynamic_shape_enabled() == false) && (kernel_cache != nullptr)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Kernel caching enabled but dynamic shapes is disabled"); + if (detail::get_backend_version() != detail::get_compiled_version()) { + CUDNN_FE_LOG_LABEL_ENDL("INFO: The cuDNN version used at compilation (" + << detail::get_compiled_version() << ") and the one used at runtime (" + << detail::get_backend_version() << ") differ."); + } + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + return {error_code_t::OK, ""}; + } + + error_t + post_validate_node() const override final { + return {error_code_t::OK, ""}; + } + + virtual error_t + collect_pass_by_value_tensors_node( + std::unordered_map &pass_by_values) const override final { + for (auto [uid, value] : deserialized_pass_by_value) { + pass_by_values.emplace(uid, value); + } + return {error_code_t::OK, ""}; + } + + virtual error_t + collect_tensors_in_workspace_node( + std::unordered_map>> + &worskspace_modifications, + int64_t &) const override { + for (auto [uid, value] : deserialized_workspace_modifications) { + worskspace_modifications.emplace(uid, value); + } + return {error_code_t::OK, ""}; + } + + virtual error_t + create_cudnn_tensors_node(std::unordered_map> &, + int64_t &, + std::unordered_set const &) const override final { + return {error_code_t::OK, ""}; + } + + error_t + extend_tensor_map_with_workspace_tensors_( + std::unordered_map &tensor_to_pointer_map, + void *workspace, + std::unordered_map>> const &worskspace_modifications) + const { 
+ for (auto const &[uid, data] : worskspace_modifications) { + tensor_to_pointer_map.emplace(uid, static_cast(workspace) + std::get<1>(data)); + } + return {error_code_t::OK, ""}; + } + + error_t + extend_tensor_map_with_pass_by_value_tensors_( + std::unordered_map &tensor_to_pointer_map, + std::unordered_map &tensor_to_pass_by_value) const { + for (auto &[uid, value] : tensor_to_pass_by_value) { + if (half *half_value_ptr = std::get_if(&value)) { + tensor_to_pointer_map.emplace(uid, half_value_ptr); + } else if (nv_bfloat16 *nv_bfloat16_value_ptr = std::get_if(&value)) { + tensor_to_pointer_map.emplace(uid, nv_bfloat16_value_ptr); + } else if (int32_t *int32_t_value_ptr = std::get_if(&value)) { + tensor_to_pointer_map.emplace(uid, int32_t_value_ptr); + } else if (int64_t *int64_t_value_ptr = std::get_if(&value)) { + tensor_to_pointer_map.emplace(uid, int64_t_value_ptr); + } else if (float *float_value_ptr = std::get_if(&value)) { + tensor_to_pointer_map.emplace(uid, float_value_ptr); + } else { + RETURN_CUDNN_FRONTEND_ERROR_IF( + true, error_code_t::INVALID_VARIANT_PACK, "Unexpected type for pass by value tensor."); + } + } + return {error_code_t::OK, ""}; + } + + error_t + make_variant_pack_replacements( + std::unordered_map &tensor_to_pointer_map, + std::unordered_map> replacements) const { + for (auto &[from_uid, value] : replacements) { + const auto &[to_uid, start_offset] = value; + + // Check if from_uid exists in the map + auto it = tensor_to_pointer_map.find(from_uid); + RETURN_CUDNN_FRONTEND_ERROR_IF(it == tensor_to_pointer_map.end(), + error_code_t::INVALID_VARIANT_PACK, + "Variant pack expected uid " + std::to_string(from_uid) + " but not found."); + + // Perform pointer arithmetic + tensor_to_pointer_map[to_uid] = static_cast(static_cast(it->second) + start_offset); + } + return {error_code_t::OK, ""}; + } + + int64_t + get_max_cudnn_workspace_size() const { + return get_max_cudnn_workspace_size_node(); + } + + // Key: uid to replace in variant pack + 
// Value: uid to replace with, start offset to add to pointer + std::unordered_map> + variant_pack_replacements; + + error_t + run_auxiliary_kernels( + cudnnHandle_t handle, + void *fe_workspace, + std::unordered_map>> &workspace_modifications) const { + cudaStream_t stream; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_stream(handle, &stream)); + char *workspace = static_cast(fe_workspace); + + for (auto [uid, data] : workspace_modifications) { + (void)uid; + if (std::get<0>(data) == 0) { + auto &vec_data = std::get<2>(data); + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_mem_cpy_async(workspace + std::get<1>(data), + vec_data.data(), + vec_data.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream)); + } else if (std::get<0>(data) == 1) { + int64_t memset_size = (int64_t)std::get<2>(data)[0]; + _CUDNN_CHECK_CUDA_ERROR( + detail::cuda_mem_set_async(workspace + std::get<1>(data), 0, memset_size, stream)); + } + } + return {error_code_t::OK, ""}; + } + + size_t + key(bool remove_shape) { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + json j; + serialize(j); + if (remove_shape) { + for (auto &tensor : j["tensors"]) { + tensor["dim"].clear(); + tensor["stride"].clear(); + } + } + return std::hash{}(j); +#else + CUDNN_FRONTEND_UNUSED(remove_shape); + return 1; +#endif + } + + // Private unified sdpa method - internal implementation for both FP16 and FP8 modes + inline SDPA_attributes::SDPA_outputs + sdpa_internal(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr v, + SDPA_attributes &&attributes) { + // Set inputs + attributes.inputs[SDPA_attributes::input_names::Q] = q; + attributes.inputs[SDPA_attributes::input_names::K] = k; + attributes.inputs[SDPA_attributes::input_names::V] = v; + + // Make required output tensors + SDPA_attributes::SDPA_outputs sdpa_outputs; + + sdpa_outputs.O = attributes.outputs[SDPA_attributes::output_names::O] = output_tensor(attributes.name + "::O"); + + if (attributes.generate_stats == true) { + sdpa_outputs.Stats = 
attributes.outputs[SDPA_attributes::output_names::Stats] = + output_tensor(attributes.name + "::Stats"); + } + + // Dropout mask dump (created conditionally based on dropout parameters) + if (attributes.outputs.find(SDPA_attributes::output_names::RNG_DUMP) != attributes.outputs.end() && + attributes.outputs.at(SDPA_attributes::output_names::RNG_DUMP) != nullptr) { + sdpa_outputs.RNG_DUMP = attributes.outputs[SDPA_attributes::output_names::RNG_DUMP]; + } + + // FP8-specific outputs (created conditionally based on FP8 scaling parameters) + if (attributes.inputs.find(SDPA_attributes::input_names::Descale_S) != attributes.inputs.end() && + attributes.inputs.at(SDPA_attributes::input_names::Descale_S) != nullptr) { + sdpa_outputs.Amax_S = attributes.outputs[SDPA_attributes::output_names::Amax_S] = + output_tensor(attributes.name + "::Amax_S"); + } + if (attributes.inputs.find(SDPA_attributes::input_names::Scale_O) != attributes.inputs.end() && + attributes.inputs.at(SDPA_attributes::input_names::Scale_O) != nullptr) { + sdpa_outputs.Amax_O = attributes.outputs[SDPA_attributes::output_names::Amax_O] = + output_tensor(attributes.name + "::Amax_O"); + } + + auto seq_len_q_it = attributes.inputs.find(SDPA_attributes::input_names::SEQ_LEN_Q); + auto seq_len_kv_it = attributes.inputs.find(SDPA_attributes::input_names::SEQ_LEN_KV); + if (seq_len_q_it != attributes.inputs.end() && seq_len_q_it->second != nullptr) { + tensors_to_dump.emplace_back(seq_len_q_it->second, 'd'); + } + if (seq_len_kv_it != attributes.inputs.end() && seq_len_kv_it->second != nullptr) { + tensors_to_dump.emplace_back(seq_len_kv_it->second, 'd'); + } + + for (auto t : {q, k, v, sdpa_outputs.O}) { + if (auto ragged = t->get_ragged_offset()) { + tensors_to_dump.emplace_back(ragged, 'd'); + } + } + + if (attributes.implementation == AttentionImplementation_t::AUTO) { + // Sets attributes.implementation to a supporting implementation, + // or leaves as AUTO if none found + 
attributes._auto_select_implementation(context); + } + + switch (attributes.implementation) { + case AttentionImplementation_t::AUTO: + throw std::runtime_error("No suitable implementation for given SDPA_attributes"); + break; + case AttentionImplementation_t::COMPOSITE: + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + break; + case AttentionImplementation_t::UNIFIED: + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + break; + } + + return sdpa_outputs; + } + + public: + Graph() : INode(detail::Context{}) {} + + error_t + update_cuda_graph(cudnnHandle_t handle, + std::unordered_map, void *> &tensor_to_pointer_map, + void *workspace, + cudaGraph_t cudnn_cuda_graph) { + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + tensor_uid_to_pointer_map.reserve(tensor_to_pointer_map.size()); + for (auto const &[tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return update_cuda_graph(handle, tensor_uid_to_pointer_map, workspace, cudnn_cuda_graph); + } + + error_t + update_cuda_graph(cudnnHandle_t handle, + std::unordered_map &uid_to_device_ptrs, + void *workspace, + cudaGraph_t cudnn_cuda_graph) { + // Initializes this cudnn graph + RETURN_CUDNN_FRONTEND_ERROR_IF( + cudnn_cuda_graph == nullptr, error_code_t::INVALID_VALUE, "cudnn_cuda_graph should not be a nullptr"); + + size_t num_root_nodes; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_get_root_nodes(cudnn_cuda_graph, nullptr, &num_root_nodes)); + RETURN_CUDNN_FRONTEND_ERROR_IF( + num_root_nodes != 1, error_code_t::INVALID_VALUE, "cudnn_cuda_graph should have exactly 1 root node."); + + cudaGraphNode_t current_node = nullptr; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_get_root_nodes(cudnn_cuda_graph, ¤t_node, &num_root_nodes)); + + /////////////////////////////////////// + //// PASS BY VALUE TENSOR HANDLING //// + /////////////////////////////////////// + // Add 
pass_by_value data pointers to uid_to_pointer map. + // Using cached values to avoid repeated tree traversal overhead. + // cuda graph will keep a copy of the kernel parameters, meaning that at the time of + // launching the cuda_graph executable, cached values being deallocated does not affect these cpu values. + // No cuda graph nodes are required for handling fe owned pass by value tensors. + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(uid_to_device_ptrs, cached_pass_by_value)); + + //////////////////////////// + //// WORKSPACE HANDLING //// + //////////////////////////// + // Using cached workspace modifications to avoid repeated tree traversal. + for (auto const &[uid, data] : cached_workspace_modifications) { + const auto &[operation_type, offset, vec_data] = data; + uid_to_device_ptrs[uid] = static_cast(workspace) + offset; + + // 0 means memcpy + if (operation_type == 0) { + _CUDNN_CHECK_CUDA_ERROR( + detail::cuda_graph_add_memcpy_node_set_params_1D(current_node, + static_cast(workspace) + offset, + vec_data.data(), + vec_data.size() * sizeof(float), + cudaMemcpyHostToDevice)); + } + // 1 means memset + else if (operation_type == 1) { + // offset from workspace + void *device_ptr = static_cast(workspace) + offset; + int64_t memset_size = static_cast(vec_data[0]); + + cudaMemsetParams params; + params.dst = device_ptr; + params.elementSize = sizeof(char); + params.value = 0x0; + params.width = memset_size; + params.height = 1; // 1D memset currently + params.pitch = 0; // unused + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_add_memset_node_set_params(current_node, ¶ms)); + } + // Other values do not correspond to CUDA graph nodes + else { + continue; + } + + size_t num_dependent_nodes; + _CUDNN_CHECK_CUDA_ERROR( + detail::cuda_graph_node_get_dependent_nodes(current_node, nullptr, &num_dependent_nodes)); + RETURN_CUDNN_FRONTEND_ERROR_IF( + num_dependent_nodes != 1, + error_code_t::INVALID_VALUE, + "Each node of 
cudnn_cuda_graph before the backend graph node should have exactly 1 dependent node."); + _CUDNN_CHECK_CUDA_ERROR( + detail::cuda_graph_node_get_dependent_nodes(current_node, ¤t_node, &num_dependent_nodes)); + } + + // Make sure device pointer is provided for all uids expected for this plan + std::vector device_ptrs; + std::vector uids; + + device_ptrs.reserve(variant_pack_uids.size()); + uids.reserve(variant_pack_uids.size()); + + for (auto const &uid : variant_pack_uids) { + auto search = uid_to_device_ptrs.find(uid); + RETURN_CUDNN_FRONTEND_ERROR_IF(search == uid_to_device_ptrs.end(), + error_code_t::INVALID_VARIANT_PACK, + "Uid " + std::to_string(uid) + " does not exist in variant pack."); + device_ptrs.push_back(search->second); + uids.push_back(uid); + } + + /////////////////// + //// BE GRAPH //// + /////////////////// + cudaGraph_t backend_cuda_graph; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_child_graph_node_get_graph(current_node, &backend_cuda_graph)); + + detail::backend_descriptor variant_pack_descriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(variant_pack_descriptor.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create variant pack's backend descriptor."); + + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void *cudnn_workspace = static_cast(workspace) + fe_workspace_size; + CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack(variant_pack_descriptor, device_ptrs, uids, cudnn_workspace)); + + int64_t candidate = plans.candidate; + CHECK_CUDNN_FRONTEND_ERROR(plans.is_plan_index_executable(candidate)); + _CUDNN_CHECK_CUDNN_ERROR(detail::update_cuda_graph(handle, + plans.execution_plans[candidate]->get_raw_desc(), + variant_pack_descriptor.get_ptr(), + backend_cuda_graph)); + + // There should be nothing after the backend graph + size_t num_dependent_nodes; + _CUDNN_CHECK_CUDA_ERROR( 
+ detail::cuda_graph_node_get_dependent_nodes(current_node, nullptr, &num_dependent_nodes)); + RETURN_CUDNN_FRONTEND_ERROR_IF(num_dependent_nodes != 0, + error_code_t::INVALID_VALUE, + "cudnn_cuda_graph should have no graph nodes after the backend graph node."); + + return {error_code_t::OK, ""}; + } + + error_t + populate_cuda_graph(cudnnHandle_t handle, + std::unordered_map, void *> &tensor_to_pointer_map, + void *workspace, + cudaGraph_t cudnn_cuda_graph) { + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + tensor_uid_to_pointer_map.reserve(tensor_to_pointer_map.size()); + for (auto const &[tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return populate_cuda_graph(handle, tensor_uid_to_pointer_map, workspace, cudnn_cuda_graph); + } + + error_t + populate_cuda_graph(cudnnHandle_t handle, + std::unordered_map &uid_to_device_ptrs, + void *workspace, + cudaGraph_t cudnn_cuda_graph) { + // Check if the cuda graph is empty + size_t numNodes = 0; + CHECK_CU_ERROR(detail::cu_graph_get_nodes(cudnn_cuda_graph, nullptr, &numNodes)); + RETURN_CUDNN_FRONTEND_ERROR_IF(numNodes != 0, + error_code_t::INVALID_VALUE, + "cuda graph provided to populate is not empty. cuDNN requires it to be empty " + "for the corresponding update APIs to work correctly."); + + // This function makes linear cuda graphs. And that makes it easy to walk + // the graph when updating it. + // So just keeping track of the last node in the cuda graph is sufficient. + cudaGraphNode_t last_node = nullptr; + + /////////////////////////////////////// + //// PASS BY VALUE TENSOR HANDLING //// + /////////////////////////////////////// + // Add pass_by_value data pointers to uid_to_pointer map. + // Using cached values to avoid repeated tree traversal overhead. 
+ // cuda graph will keep a copy of the kernel parameters, meaning that at the time of + // launching the cuda_graph executable, cached values being deallocated does not affect these cpu values. + // No cuda graph nodes are required for handling fe owned pass by value tensors. + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(uid_to_device_ptrs, cached_pass_by_value)); + + ///////////////////////////////// + //// WORKSPACE HANDLING //// + ///////////////////////////////// + // Using cached workspace modifications to avoid repeated tree traversal. + for (auto const &[uid, data] : cached_workspace_modifications) { + const auto &[operation_type, offset, vec_data] = data; + uid_to_device_ptrs[uid] = static_cast(workspace) + offset; + + cudaGraphNode_t node = nullptr; + + // 0 means memcpy + if (operation_type == 0) { + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_add_memcpy_node_1D(&node, + cudnn_cuda_graph, + &last_node, + last_node != nullptr, + static_cast(workspace) + offset, + vec_data.data(), + vec_data.size() * sizeof(float), + cudaMemcpyHostToDevice)); + } + // 1 means memset + else if (operation_type == 1) { + // offset from workspace + void *device_ptr = static_cast(workspace) + offset; + int64_t memset_size = static_cast(vec_data[0]); + + cudaMemsetParams params; + params.dst = device_ptr; + params.elementSize = sizeof(char); + params.value = 0x0; + params.width = memset_size; + params.height = 1; // 1D memset currently + params.pitch = 0; // unused + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_add_memset_node( + &node, cudnn_cuda_graph, &last_node, last_node != nullptr, ¶ms)); + } + // Other values do not correspond to CUDA graph nodes + else { + continue; + } + + last_node = node; + } + + ////////////// + // BE graph // + ////////////// + + // Get the BE's cuda graph + + // Make sure device pointer is provided for all uids expected for this plan + std::vector device_ptrs; + device_ptrs.reserve(variant_pack_uids.size()); + 
std::vector uids; + uids.reserve(variant_pack_uids.size()); + for (auto const &uid : variant_pack_uids) { + auto search = uid_to_device_ptrs.find(uid); + RETURN_CUDNN_FRONTEND_ERROR_IF(search == uid_to_device_ptrs.end(), + error_code_t::INVALID_VARIANT_PACK, + "Uid " + std::to_string(uid) + " does not exist in variant pack."); + device_ptrs.push_back(search->second); + uids.push_back(uid); + } + + // Create the variant pack to pass to backend + detail::backend_descriptor variant_pack_descriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(variant_pack_descriptor.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create variant pack's backend descriptor."); + + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void *cudnn_workspace = static_cast(workspace) + fe_workspace_size; + CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack(variant_pack_descriptor, device_ptrs, uids, cudnn_workspace)); + + // Get the plan candidate. It only makes to sense to make cuda graph after execution plan has been built. + // And in that case the candidate would have been set. + int64_t candidate = plans.candidate; + CHECK_CUDNN_FRONTEND_ERROR(plans.is_plan_index_executable(candidate)); + + // Finally get the backend cuda graph. + cudaGraph_t backend_cuda_graph; + // Initialize the cudnn cuda graph. + // The responsibility to destroy is on the user. 
+ detail::cu_graph_create(&backend_cuda_graph, 0); // 0 is just what the API says to pass + + _CUDNN_CHECK_CUDNN_ERROR(detail::populate_cuda_graph(handle, + plans.execution_plans[candidate]->get_raw_desc(), + variant_pack_descriptor.get_ptr(), + backend_cuda_graph)); + + // Clone BE graph into a graph_node + // This same call also places the newly created into FE's graph + // TODO: BE graph is at the end, so put in appropriate dependencies + cudaGraphNode_t backend_cuda_graph_node; + detail::cuda_graph_add_child_graph_node( + &backend_cuda_graph_node, cudnn_cuda_graph, &last_node, last_node != nullptr, backend_cuda_graph); + + // Destroy the BE graph as it now has been cloned into a node + // It was initialized by internals of backend, but the responsibility to destroy it is on FE. + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_destroy(backend_cuda_graph)); + + return {error_code_t::OK, ""}; + } + + error_t + validate() { + CUDNN_FE_LOG_BANNER(" VALIDATING GRAPH "); + CUDNN_FE_LOG(*this << std::endl;); + + // First validate all inputs that the user set. + for (auto const &input : full_graph_inputs) { + CHECK_CUDNN_FRONTEND_ERROR(input->validate()); + } + + // Validate the nodes, which in turn also infers missing tensor attributes. + CHECK_CUDNN_FRONTEND_ERROR(validate_subtree()); + // Validate all outputs, which should now have everything set to be lowered to backend. 
+ for (auto const &output : full_graph_outputs) { + CHECK_CUDNN_FRONTEND_ERROR(output->validate()); + } + + // Get all the pre assigned uids + CHECK_CUDNN_FRONTEND_ERROR(get_pre_assigned_uids(used_uids)); + // Clear state + used_uids.clear(); + + CUDNN_FE_LOG_BANNER(" VALIDATED ALL OK "); + + return {error_code_t::OK, ""}; + } + + // overload for deviceless AoT compilation + error_t + build_operation_graph() { + CUDNN_FE_LOG_BANNER(" BUILD OP GRAPH WITHOUT HANDLE "); + + if (device_properties == nullptr) { + return {error_code_t::ATTRIBUTE_NOT_SET, "Device properties are not set."}; + } + CUDNN_FE_LOG_BANNER(" BUILT OP GRAPH WITHOUT HANDLE "); + return build_operation_graph(nullptr); + } + + error_t + build_operation_graph(cudnnHandle_t handle) { + CUDNN_FE_LOG_BANNER(" BUILD OP GRAPH "); + + CUDNN_FE_LOG_BANNER(" 1/4 INFER PROPERTIES OF NODES "); + + // expand composite nodes + CHECK_CUDNN_FRONTEND_ERROR(expand_subtree()); + + // Get all the pre assigned uids + CHECK_CUDNN_FRONTEND_ERROR(get_pre_assigned_uids(used_uids)); + + CUDNN_FE_LOG_BANNER(" 2/4 CREATE TENSORS "); + + Tensor_attributes::uid_t start_uid = 1; + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors_subtree(uid_to_tensors, start_uid, used_uids)); + + CUDNN_FE_LOG_BANNER(" 3/4 CREATE OPERATIONS "); + // INode keeps track of all uids that an operation graph uses. + // This helps to return errors to user during execution, without relying on backend to do so. + // Also, as uid in a variant pack have to be unique, keep a set of them. + CHECK_CUDNN_FRONTEND_ERROR( + create_cudnn_operations(variant_pack_uids, operations, raw_operations, uid_to_tensors)); + + // Collect variant pack modifiers when lowering to backend. + // The collected map is used everytime when execute is called. 
+ CHECK_CUDNN_FRONTEND_ERROR(collect_variant_pack_replacements_subtree(variant_pack_replacements)); + + fe_workspace_size = get_fe_workspace_size_subtree(); + + // Cache pass_by_value tensors and workspace modifications for fast execution. + // These are collected once here and reused in every execute() call to avoid + // repeated tree traversal and map allocation overhead. + CHECK_CUDNN_FRONTEND_ERROR(collect_pass_by_value_tensors_subtree(cached_pass_by_value)); + { + int64_t temp_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR( + collect_tensors_in_workspace_subtree(cached_workspace_modifications, temp_offset)); + } + + CUDNN_FE_LOG_BANNER(" 4/4 LOWERING TO BACKEND OPERATION GRAPH "); + + // The method here fuses all operations. There will be 1 operation graph in total. + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_operation_graph(handle)); + + if (context.get_dynamic_shape_enabled() && kernel_cache && !kernel_cache->is_finalized()) { + CUDNN_FE_LOG_BANNER(" BUILD KERNEL CACHE "); + CHECK_CUDNN_FRONTEND_ERROR(kernel_cache->build(operation_graph->get_raw_desc())); + } + + CUDNN_FE_LOG_BANNER(" BUILD OP GRAPH ALL OK === "); + + return {error_code_t::OK, ""}; + } + + error_t + get_plan_name(std::string &name) const { + return get_plan_name_at_index(plans.candidate, name); + } + + error_t + get_plan_name_at_index(int64_t plan_index, std::string &name) const { + auto ret_val = plans.get_name_at_index(plan_index, name); + CUDNN_FE_LOG_LABEL_ENDL("INFO: get_plan_name_at_index(" << plan_index << ") is " + name); + return ret_val; + } + + error_t + get_workspace_size(int64_t &cudnn_workspace_size) const { + return get_workspace_size_plan_at_index(plans.candidate, cudnn_workspace_size); + } + + error_t + get_workspace_size_plan_at_index(int64_t plan_index, int64_t &cudnn_workspace_size) const { + // There are two workspaces: + // - cudnn execution plan workspace + // - FE node workspace (example: alibiSlope for fmha) + int64_t cudnn_ws = 0; + 
CHECK_CUDNN_FRONTEND_ERROR(get_cudnn_workspace_size_node(plan_index, cudnn_ws)); + cudnn_workspace_size = cudnn_ws + fe_workspace_size; + CUDNN_FE_LOG_LABEL_ENDL("INFO: get_workspace_size() is " << cudnn_workspace_size); + return {error_code_t::OK, ""}; + } + + int64_t + get_workspace_size() const { + return get_workspace_size_plan_at_index(plans.candidate); + } + + int64_t + get_workspace_size_plan_at_index(int64_t plan_index) const { + int64_t cudnn_workspace = 0; + auto status = get_workspace_size_plan_at_index(plan_index, cudnn_workspace); + if (status.is_bad()) { + CUDNN_FE_LOG_LABEL_ENDL("ERROR: Querying workspace failed."); + } + return cudnn_workspace; + } + + int64_t + get_autotune_workspace_size() const { + // There are two workspaces: + // - cudnn execution plan workspace + // - FE node workspace (example: alibiSlope for fmha) + return fe_workspace_size + get_max_cudnn_workspace_size(); + } + + error_t + autotune(cudnnHandle_t handle, + std::unordered_map &tensor_uid_to_pointer_map, + void *workspace, + void *user_impl = nullptr) { + // Add pass_by_value data pointers to tensor_uid_to_pointer map. + // Using cached values to avoid repeated tree traversal overhead. 
+ CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, cached_pass_by_value)); + + CHECK_CUDNN_FRONTEND_ERROR( + make_variant_pack_replacements(tensor_uid_to_pointer_map, variant_pack_replacements)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, cached_workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR(extend_tensor_map_with_workspace_tensors_( + tensor_uid_to_pointer_map, workspace, cached_workspace_modifications)); + + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void *cudnn_workspace = static_cast(workspace) + fe_workspace_size; + + CHECK_CUDNN_FRONTEND_ERROR(plans.autotune(handle, tensor_uid_to_pointer_map, cudnn_workspace, user_impl)); + return {error_code_t::OK, ""}; + } + + error_t + autotune(cudnnHandle_t handle, + std::unordered_map, void *> &tensor_to_pointer_map, + void *workspace, + void *user_impl = nullptr) { + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + for (auto const &[tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return autotune(handle, tensor_uid_to_pointer_map, workspace, user_impl); + } + + error_t + execute_plan_at_index(cudnnHandle_t handle, + std::unordered_map, void *> &tensor_to_pointer_map, + void *workspace, + int64_t plan_index) const { + CUDNN_FE_LOG_BANNER(" EXECUTE PLAN AT INDEX for plan index (with Tensor keys) " << plan_index << " "); + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + for (auto const &[tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return execute_plan_at_index(handle, tensor_uid_to_pointer_map, workspace, plan_index); + } + + error_t + execute(cudnnHandle_t handle, + std::unordered_map, void *> 
&tensor_to_pointer_map, + void *workspace) const { + CUDNN_FE_LOG_BANNER(" EXECUTE PLAN (with Tensor keys) "); + + // First get all the uids from the map + std::unordered_map tensor_uid_to_pointer_map; + for (auto const &[tensor, pointer] : tensor_to_pointer_map) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), pointer); + } + + return execute(handle, tensor_uid_to_pointer_map, workspace); + } + error_t + execute_plan_at_index(cudnnHandle_t handle, + std::unordered_map &tensor_uid_to_pointer_map, + void *workspace, + int64_t plan_index, + std::vector const &override_uids, + std::vector> const &override_shapes, + std::vector> const &override_strides) const { + // Add pass_by_value data pointers to uid_to_pointer map. + // Using cached values to avoid repeated tree traversal overhead. + // Object lifetime is controlled by cached_pass_by_value which persists for the Graph's lifetime. + CUDNN_FE_LOG_BANNER(" EXECUTE PLAN AT INDEX for plan index " << plan_index << " "); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, cached_pass_by_value)); + + CHECK_CUDNN_FRONTEND_ERROR( + make_variant_pack_replacements(tensor_uid_to_pointer_map, variant_pack_replacements)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, cached_workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR(extend_tensor_map_with_workspace_tensors_( + tensor_uid_to_pointer_map, workspace, cached_workspace_modifications)); + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void *cudnn_workspace = static_cast(workspace) + fe_workspace_size; + + if (isLoggingEnabled()) { + cudaStream_t stream; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_stream(handle, &stream)); + for (auto const &[uid, ptr] : tensor_uid_to_pointer_map) { + CHECK_CUDNN_FRONTEND_ERROR(detail::log_variant_pack_memory_type(uid, ptr)); + } + for (auto const &[tensor, 
fmt] : tensors_to_dump) { + auto it = tensor_uid_to_pointer_map.find(tensor->get_uid()); + if (it != tensor_uid_to_pointer_map.end()) { + auto const &dims = tensor->get_dim(); + size_t num_elements = 1; + for (auto d : dims) num_elements *= static_cast(d); + size_t elem_size = detail::get_data_type_size(tensor->get_data_type()); + CHECK_CUDNN_FRONTEND_ERROR(detail::log_dump_tensor_content( + it->first, tensor->get_name(), it->second, num_elements, elem_size, fmt, stream)); + } + } + } + + CHECK_CUDNN_FRONTEND_ERROR(execute_cudnn_plan_with_uid(handle, + tensor_uid_to_pointer_map, + cudnn_workspace, + plan_index, + override_uids, + override_shapes, + override_strides)); + + CUDNN_FE_LOG_BANNER(" EXECUTE PLAN AT INDEX ALL OK for plan index " << plan_index << " "); + return {error_code_t::OK, ""}; + } + + error_t + execute(cudnnHandle_t handle, + std::unordered_map &tensor_uid_to_pointer_map, + void *workspace, + std::vector const &override_uids, + std::vector> const &override_shapes, + std::vector> const &override_strides) const { + // Add pass_by_value data pointers to uid_to_pointer map. + // Using cached values to avoid repeated tree traversal overhead. 
+ CUDNN_FE_LOG_BANNER(" EXECUTE PLAN "); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, cached_pass_by_value)); + CHECK_CUDNN_FRONTEND_ERROR( + make_variant_pack_replacements(tensor_uid_to_pointer_map, variant_pack_replacements)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, cached_workspace_modifications)); + + CHECK_CUDNN_FRONTEND_ERROR(extend_tensor_map_with_workspace_tensors_( + tensor_uid_to_pointer_map, workspace, cached_workspace_modifications)); + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void *cudnn_workspace = static_cast(workspace) + fe_workspace_size; + + if (isLoggingEnabled()) { + cudaStream_t stream; + _CUDNN_CHECK_CUDNN_ERROR(detail::get_stream(handle, &stream)); + for (auto const &[uid, ptr] : tensor_uid_to_pointer_map) { + CHECK_CUDNN_FRONTEND_ERROR(detail::log_variant_pack_memory_type(uid, ptr)); + } + for (auto const &[tensor, fmt] : tensors_to_dump) { + auto it = tensor_uid_to_pointer_map.find(tensor->get_uid()); + if (it != tensor_uid_to_pointer_map.end()) { + auto const &dims = tensor->get_dim(); + size_t num_elements = 1; + for (auto d : dims) num_elements *= static_cast(d); + size_t elem_size = detail::get_data_type_size(tensor->get_data_type()); + CHECK_CUDNN_FRONTEND_ERROR(detail::log_dump_tensor_content( + it->first, tensor->get_name(), it->second, num_elements, elem_size, fmt, stream)); + } + } + } + + CHECK_CUDNN_FRONTEND_ERROR(execute_cudnn_plan_with_uid(handle, + tensor_uid_to_pointer_map, + cudnn_workspace, + plans.candidate, + override_uids, + override_shapes, + override_strides)); + + CUDNN_FE_LOG_BANNER(" EXECUTE PLAN ALL OK "); + return {error_code_t::OK, ""}; + } + + error_t + execute_plan_at_index(cudnnHandle_t handle, + std::unordered_map &tensor_uid_to_pointer_map, + void *workspace, + int64_t plan_index) const { + // Add pass_by_value 
data pointers to uid_to_pointer map + // object lifetime is controlled by tensor_to_pass_by_value which means the pointer should stay valid during + // execute. + CHECK_CUDNN_FRONTEND_ERROR( + execute_plan_at_index(handle, tensor_uid_to_pointer_map, workspace, plan_index, {}, {}, {})); + return {error_code_t::OK, ""}; + } + + error_t + execute(cudnnHandle_t handle, + std::unordered_map &tensor_uid_to_pointer_map, + void *workspace) const { + // Add pass_by_value data pointers to uid_to_pointer map. + // Using cached values to avoid repeated tree traversal overhead. + CUDNN_FE_LOG_BANNER(" EXECUTE PLAN "); + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, cached_pass_by_value)); + CHECK_CUDNN_FRONTEND_ERROR( + make_variant_pack_replacements(tensor_uid_to_pointer_map, variant_pack_replacements)); + + CHECK_CUDNN_FRONTEND_ERROR(run_auxiliary_kernels(handle, workspace, cached_workspace_modifications)); + + //逻辑注册 把刚才 Workspace 里那些新变量的地址正式告诉执行器。 + CHECK_CUDNN_FRONTEND_ERROR(extend_tensor_map_with_workspace_tensors_( + tensor_uid_to_pointer_map, workspace, cached_workspace_modifications)); + // offset workspace by the already used fe graph workspace + // this is where cudnn backend can start using workspace for its execution plans + void *cudnn_workspace = static_cast(workspace) + fe_workspace_size; + + CHECK_CUDNN_FRONTEND_ERROR(execute_cudnn_plan_with_uid( + handle, tensor_uid_to_pointer_map, cudnn_workspace, plans.candidate, {}, {}, {})); + + CUDNN_FE_LOG_BANNER(" EXECUTE PLAN ALL OK "); + return {error_code_t::OK, ""}; + } + + error_t + warmup(cudnnHandle_t handle) { + cudaStream_t fake_stream; + + cudaStream_t original_stream; + + _CUDNN_CHECK_CUDNN_ERROR(detail::get_stream(handle, &original_stream)); + + CUDNN_FE_LOG_BANNER("WARMUP (BEGIN FAKE GRAPH CAPTURE) "); + + if (original_stream == nullptr) { + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_stream_create(&fake_stream)); + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_stream(handle, fake_stream)); + } else { + fake_stream = original_stream; + } + + cudaGraph_t graph_obj; + + cudaStreamCaptureStatus capture_status; + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_stream_is_capturing(fake_stream, &capture_status)); + + CUDNN_FE_LOG_LABEL_ENDL("INFO: capture_status " + << capture_status << " original_stream " + << ((original_stream == nullptr) ? "DEFAULT (NULL) Stream" : "NON-DEFAULT Stream")); + + if (capture_status != cudaStreamCaptureStatusNone) { + CUDNN_FE_LOG_LABEL_ENDL("INFO: cuda graph capture active, aborting warmup"); + return {error_code_t::OK, "cuda graph capture active, aborting warmup"}; + } + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_begin_capture(fake_stream, cudaStreamCaptureModeRelaxed)); + + std::unordered_map tensor_uid_to_pointer_map; + + void *tmp_pointer = reinterpret_cast(0x7f0000000000llu); + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_malloc((void **)&tmp_pointer, 1024 * 1024)); + + float tmp_double = 1.0f; + void *cpu_pointer = reinterpret_cast(&tmp_double); + + for (auto const &tensor : deserialized_tensor_properties) { + if (tensor->get_is_virtual() == false) { + if (tensor->get_is_pass_by_value() == false) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), tmp_pointer); + } else { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), cpu_pointer); + } + } + } + + CUDNN_FE_LOG_LABEL_ENDL("INFO: full_graph_inputs: " << full_graph_inputs.size() << " elements"); + for (auto const &tensor : full_graph_inputs) { + CUDNN_FE_LOG_LABEL_ENDL("\tuid: " << tensor->get_uid() + << ", is_pass_by_value = " << tensor->get_is_pass_by_value()); + if (tensor->get_is_pass_by_value() == false) { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), tmp_pointer); + } else { + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), cpu_pointer); + } + } + CUDNN_FE_LOG_LABEL_ENDL("INFO: full_graph_outputs: " << full_graph_outputs.size() << " elements"); + for (auto const &tensor : 
full_graph_outputs) { + CUDNN_FE_LOG_LABEL_ENDL("\tuid: " << tensor->get_uid()); + tensor_uid_to_pointer_map.emplace(tensor->get_uid(), tmp_pointer); + } + + CHECK_CUDNN_FRONTEND_ERROR( + extend_tensor_map_with_pass_by_value_tensors_(tensor_uid_to_pointer_map, deserialized_pass_by_value)); + + auto cudnn_status = execute(handle, tensor_uid_to_pointer_map, tmp_pointer); + (void)cudnn_status; // No need to check bad executes + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_end_capture(fake_stream, &graph_obj)); + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_graph_destroy(graph_obj)); + + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_free(tmp_pointer)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_stream(handle, original_stream)); + + if (original_stream == nullptr) { + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_stream_destroy(fake_stream)); + } + + CUDNN_FE_LOG_BANNER("WARMUP (END FAKE GRAPH CAPTURE) "); + + return {error_code_t::OK, ""}; + } + + error_t + serialize(std::vector &data) const { + CUDNN_FE_LOG_BANNER(" SERIALIZE PLAN "); +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + json j; + serialize(j); + + auto const candidate = plans.candidate; + auto execution_plan = plans.execution_plans[candidate]; + if (execution_plan != nullptr) { + auto serialized_plan = execution_plan->getJsonRepresentation(); + j["cudnn_backend_data"] = serialized_plan; + j["variant_pack_uids"] = variant_pack_uids; + } + + j["behavior_notes"] = plans.behavior_notes; + + std::unordered_map tensor_to_pass_by_value; + CHECK_CUDNN_FRONTEND_ERROR(collect_pass_by_value_tensors_subtree(tensor_to_pass_by_value)); + j["pass_by_values"] = tensor_to_pass_by_value; + + std::unordered_map>> workspace_modifications; + int64_t workspace_offset = 0; + CHECK_CUDNN_FRONTEND_ERROR(collect_tensors_in_workspace_subtree(workspace_modifications, workspace_offset)); + j["workspace_modifications"] = workspace_modifications; + + j["variant_pack_replacements"] = variant_pack_replacements; + + j["fe_workspace_size"] = fe_workspace_size; + + 
std::vector> tensors_to_dump_uids; + for (auto const &[tensor, fmt] : tensors_to_dump) { + tensors_to_dump_uids.emplace_back(tensor->get_uid(), fmt); + } + j["tensors_to_dump"] = tensors_to_dump_uids; + + data = json::to_ubjson(j); + CUDNN_FE_LOG_BANNER(" SERIALIZE PLAN (ALL OK) "); + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(data); + return {error_code_t::GRAPH_NOT_SUPPORTED, "unavailable when compiled with CUDNN_FRONTEND_SKIP_JSON_LIB"}; +#endif + } + + error_t + deserialize(cudnnHandle_t handle, std::vector const &data) { + CUDNN_FE_LOG_BANNER(" DESERIALIZE PLAN WITH HANDLE "); + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + json j = json::from_ubjson(data); + + if (j.contains("tensors")) { + auto tensor_map = j["tensors"].get>(); + for (const auto &tensor_info : tensor_map) { + auto tensor_attributes = std::make_shared(); + from_json(tensor_info.second, *tensor_attributes); + deserialized_tensor_properties.insert(tensor_attributes); + } + } + + auto serialized_plan = j["cudnn_backend_data"]; + + CHECK_CUDNN_FRONTEND_ERROR(plans.build_plans(handle, serialized_plan)); + + plans.behavior_notes = j["behavior_notes"].get>>(); + + variant_pack_uids = j["variant_pack_uids"].get>(); + + deserialized_pass_by_value = j["pass_by_values"]; + + deserialized_workspace_modifications = j["workspace_modifications"]; + + variant_pack_replacements = j["variant_pack_replacements"]; + + fe_workspace_size = j["fe_workspace_size"]; + + // Initialize the execution caches from deserialized data + cached_pass_by_value = deserialized_pass_by_value; + cached_workspace_modifications = deserialized_workspace_modifications; + + if (j.contains("tensors_to_dump")) { + auto dump_uids = j["tensors_to_dump"].get>>(); + for (auto const &[uid, fmt] : dump_uids) { + for (auto const &tensor : deserialized_tensor_properties) { + if (tensor->get_uid() == uid) { + tensors_to_dump.emplace_back(tensor, fmt); + break; + } + } + } + } + + CHECK_CUDNN_FRONTEND_ERROR(warmup(handle)); + + 
CUDNN_FE_LOG_BANNER(" DESERIALIZE PLAN WITH HANDLE (ALL OK) "); + + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(handle); + CUDNN_FRONTEND_UNUSED(data); + return {error_code_t::GRAPH_NOT_SUPPORTED, "unavailable when compiled with CUDNN_FRONTEND_SKIP_JSON_LIB"}; +#endif + } + + Type + getType() override { + return Type::COMPOSITE; + } + + Graph & + set_intermediate_data_type(DataType_t type); + Graph & + set_io_data_type(DataType_t type); + Graph & + set_compute_data_type(DataType_t type); + Graph & + set_dynamic_shape_enabled(bool is_enabled); + Graph & + set_sm_count(int32_t type); + Graph & + set_sm_version(int32_t version); + Graph & + set_kernel_cache(std::shared_ptr cache); + Graph & + set_device_properties(std::shared_ptr device_prop); + + Graph & + set_name(std::string const &name) { + context.set_name(name); + return *this; + } + + error_t + query_tensor_attributes_of_uid(int64_t const uid, Tensor_attributes &tensor) const; + + std::shared_ptr + tensor(Tensor_attributes const &tensor); + + std::shared_ptr + tensor_like(std::shared_ptr const &tensor, std::string const &name = std::string{}); + + std::array, 3> layernorm(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Layernorm_attributes); + + std::array, 3> adalayernorm(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + AdaLayernorm_attributes); + + std::array, 3> instancenorm(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Instancenorm_attributes); + + std::array, 5> batchnorm(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Batchnorm_attributes); + + std::shared_ptr batchnorm_inference(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Batchnorm_inference_attributes); + + std::array, 6> bn_finalize(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + BN_finalize_attributes); + + std::shared_ptr conv_fprop(std::shared_ptr, + std::shared_ptr, + 
Conv_fprop_attributes); + + std::shared_ptr conv_dgrad(std::shared_ptr, + std::shared_ptr, + Conv_dgrad_attributes); + + std::shared_ptr conv_wgrad(std::shared_ptr, + std::shared_ptr, + Conv_wgrad_attributes); + + std::array, 5> dbn_weight(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + DBN_weight_attributes); + + std::array, 3> batchnorm_backward(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Batchnorm_backward_attributes); + + std::array, 3> layernorm_backward(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Layernorm_backward_attributes); + + std::array, 3> adalayernorm_backward(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + AdaLayernorm_backward_attributes); + + std::array, 3> instancenorm_backward(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Instancenorm_backward_attributes); + std::array, 2> genstats(std::shared_ptr, Genstats_attributes); + + std::array, 2> rmsnorm(std::shared_ptr, + std::shared_ptr, + Rmsnorm_attributes); + + std::array, 3> rmsnorm_backward(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Rmsnorm_backward_attributes); + + std::array, 2> sdpa(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + SDPA_attributes); + + std::array, 4> sdpa_fp8(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + SDPA_fp8_attributes); + + inline std::array, 7> sdpa_fp8_backward(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + SDPA_fp8_backward_attributes); + + std::array, 3> sdpa_backward(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + 
std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + SDPA_backward_attributes); + + std::shared_ptr slice(std::shared_ptr, Slice_attributes); + + std::array, 2> block_scale_quantize(std::shared_ptr, + Block_scale_quantize_attributes); + + std::shared_ptr block_scale_dequantize(std::shared_ptr, + std::shared_ptr, + Block_scale_dequantize_attributes); + + std::shared_ptr concatenate(std::vector>, + Concatenate_attributes); + + std::shared_ptr moe_grouped_matmul(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Moe_grouped_matmul_attributes); + + [[deprecated]] std::array, 2> + scaled_dot_product_flash_attention(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr v, + SDPA_attributes attributes) { + return sdpa(q, k, v, attributes); + } + [[deprecated]] std::array, 3> + scaled_dot_product_flash_attention_backward(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr v, + std::shared_ptr o, + std::shared_ptr dO, + std::shared_ptr stats, + SDPA_backward_attributes attributes) { + return sdpa_backward(q, k, v, o, dO, stats, attributes); + } + + error_t + create_execution_plans(std::vector const &mode); + + error_t + create_execution_plan(int64_t const engine_id, std::unordered_map const &knobs); + + int64_t + get_execution_plan_count() const; + + inline error_t + get_engine_count(int64_t &count); + + inline error_t + get_knobs_for_engine(int64_t const engine, std::vector &); + + error_t + check_support(cudnnHandle_t h) { + // handle not required anymore + // TODO: remove this function in next release + (void)h; + return check_support(); + } + + // overload for deviceless AoT compilation + error_t + check_support() { + CHECK_CUDNN_FRONTEND_ERROR(plans.check_support()); + return {error_code_t::OK, ""}; + } + + // TODO: remove this function in next release + error_t + build(cudnnHandle_t const &handle, + std::vector const &mode, + BuildPlanPolicy_t const policy = BuildPlanPolicy_t::HEURISTICS_CHOICE, + bool const 
do_multithreaded_builds = false); + + // overload for deviceless AoT compilation + error_t + build(std::vector const &mode, + BuildPlanPolicy_t const policy = BuildPlanPolicy_t::HEURISTICS_CHOICE, + bool const do_multithreaded_builds = false); + + error_t + build_plans(cudnnHandle_t const &handle, + BuildPlanPolicy_t const policy = BuildPlanPolicy_t::HEURISTICS_CHOICE, + bool const do_multithreaded_builds = false) { + // handle not required anymore + // TODO: remove this function in next release + (void)handle; + return build_plans(policy, do_multithreaded_builds); + } + + // overload for deviceless AoT compilation + error_t + build_plans(BuildPlanPolicy_t const policy = BuildPlanPolicy_t::HEURISTICS_CHOICE, + bool const do_multithreaded_builds = false); + + error_t + build_plan_at_index(cudnnHandle_t const &handle, int64_t index) { + // handle not required anymore + // TODO: remove this function in next release + (void)handle; + return build_plan_at_index(index); + } + + // overload for deviceless AoT compilation + error_t + build_plan_at_index(int64_t index); + + Graph & + deselect_workspace_greater_than(int64_t const workspace) { + plans.set_max_workspace_allowed(workspace); + return *this; + } + + Graph & + deselect_shared_mem_greater_than(int64_t const workspace) { + plans.set_max_shared_mem_allowed(workspace); + return *this; + } + + Graph & + deselect_engines(std::vector const &engine_names) { + plans.set_barred_names(engine_names); + return *this; + } + + Graph & + select_behavior_notes(std::vector const ¬es) { + auto status = plans.filter_behavior_notes(notes, true); + if (status.is_bad()) { + CUDNN_FE_LOG(status.get_message() << std::endl); + } + return *this; + } + + Graph & + select_numeric_notes(std::vector const ¬es) { + auto status = plans.filter_numeric_notes(notes, true); + if (status.is_bad()) { + CUDNN_FE_LOG(status.get_message() << std::endl); + } + return *this; + } + + Graph & + deselect_behavior_notes(std::vector const ¬es) { + auto status = 
plans.filter_behavior_notes(notes, false); + if (status.is_bad()) { + CUDNN_FE_LOG(status.get_message() << std::endl); + } + return *this; + } + + Graph & + deselect_numeric_notes(std::vector const ¬es) { + auto status = plans.filter_numeric_notes(notes, false); + if (status.is_bad()) { + CUDNN_FE_LOG(status.get_message() << std::endl); + } + return *this; + } + + error_t + get_behavior_notes_for_plan_at_index(int64_t const index, std::vector ¬es) const; + + error_t + get_behavior_notes(std::vector ¬es) const; + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json &j) const override final { + // Different from serialization of other INodes. + // Go over each subnode and serialize them. + json full_json; + + full_json["context"]["name"] = context.get_name(); + full_json["context"]["compute_data_type"] = context.get_compute_data_type(); + full_json["context"]["intermediate_data_type"] = context.get_intermediate_data_type(); + full_json["context"]["io_data_type"] = context.get_io_data_type(); + full_json["context"]["sm_count"] = context.get_target_sm_count(); + full_json["context"]["is_dynamic_shape_enabled"] = context.get_dynamic_shape_enabled(); + + full_json.update(R"( {"tag": "GRAPH"})"_json); + full_json["nodes"]; + for (auto const &sub_node : sub_nodes) { + json j_sub_node; + sub_node->serialize(j_sub_node); + full_json["nodes"].push_back(j_sub_node); + } + + j["context"] = full_json["context"]; + + j["json_version"] = "1.0"; + j["cudnn_backend_version"] = detail::get_backend_version_string(); + j["cudnn_frontend_version"] = CUDNN_FRONTEND_VERSION; + j["nodes"]; + j["tensors"]; + std::unordered_set tensors; + for (const auto &sub_node : full_json["nodes"]) { + // Create a short version of the node + auto short_node = sub_node; + short_node["inputs"] = {}; + short_node["outputs"] = {}; + + auto node_name = sub_node["tag"].get(); + auto i = 0; + // Process node inputs + for (const auto &input : sub_node["inputs"]) { + std::string port_name; + json 
tensor_info; + + if (node_name == "CONCATENATE") { + // Extract port_name and tensor_name + port_name = std::to_string(i); + tensor_info = input; + i++; + } else { + // Extract port_name and tensor_name + port_name = input[0].get(); + tensor_info = input[1]; + } + + if (tensor_info.is_null()) { + continue; + } + + std::string tensor_name = tensor_info["name"].get(); + // Update short_node inputs + short_node["inputs"][port_name] = tensor_name; + + // Check if the tensor is already in the tensors map + if (tensors.find(tensor_name) == tensors.end()) { + // If not, add it to the j["tensors"] + j["tensors"][tensor_name] = tensor_info; + } + } + + // Process node outputs + for (const auto &output : sub_node["outputs"]) { + // Extract port_name and tensor_name + auto port_name = output[0].get(); + auto tensor_info = output[1]; + + if (tensor_info.is_null()) { + continue; + } + + std::string tensor_name = tensor_info["name"].get(); + + // Update short_node outputs + short_node["outputs"][port_name] = tensor_name; + + // Check if the tensor is already in the tensors map + if (tensors.find(tensor_name) == tensors.end()) { + // If not, add it to the j["tensors"] + j["tensors"][tensor_name] = tensor_info; + } + } + + // Add the short_node to j["nodes"] + j["nodes"].push_back(short_node); + } + }; +#endif + + size_t + key() override final { + return key(context.get_dynamic_shape_enabled()); + } + + // TODO: temparorily placed in graphs class. This function needs to be a free standing function. 
+#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + error_t + deserialize(const json &j) { + if (j.contains("context")) { + const auto &j_context = j["context"]; + if (j_context.contains("compute_data_type") && !j_context["compute_data_type"].is_null()) { + context.set_compute_data_type(j_context["compute_data_type"].get()); + } + if (j_context.contains("intermediate_data_type") && !j_context["intermediate_data_type"].is_null()) { + context.set_intermediate_data_type(j_context["intermediate_data_type"].get()); + } + if (j_context.contains("io_data_type") && !j_context["io_data_type"].is_null()) { + context.set_io_data_type(j_context["io_data_type"].get()); + } + if (j_context.contains("name") && !j_context["name"].is_null()) { + context.set_name(j_context["name"].get()); + } + if (j_context.contains("sm_count") && !j_context["sm_count"].is_null()) { + context.set_target_sm_count(j_context["sm_count"].get()); + } + if (j_context.contains("is_dynamic_shape_enabled") && !j_context["is_dynamic_shape_enabled"].is_null()) { + context.set_dynamic_shape_enabled(j_context["is_dynamic_shape_enabled"].get()); + } + } + + std::map> created_tensors; + // Iterate through each sub-node in the full JSON + if (j.contains("nodes") && j["nodes"].is_array()) { + for (auto j_sub_node : j["nodes"]) { + // Create a JSON object for inputs + json inputs; + + // Iterate through each input of the sub-node + if (j_sub_node.contains("inputs") && j_sub_node["inputs"].is_object()) { + for (auto &[port_name, tensor_name] : j_sub_node["inputs"].items()) { + if (j.contains("tensors") && j["tensors"].contains(tensor_name)) { + // Add the input to the inputs JSON object + inputs.push_back({port_name, j["tensors"][tensor_name]}); + } + } + } + + // Create a JSON object for outputs + json outputs; + + // Iterate through each output of the sub-node + if (j_sub_node.contains("outputs") && j_sub_node["outputs"].is_object()) { + for (auto &[port_name, tensor_name] : j_sub_node["outputs"].items()) { + if 
(j.contains("tensors") && j["tensors"].contains(tensor_name)) { + // Add the output to the outputs JSON object + outputs.push_back({port_name, j["tensors"][tensor_name]}); + } + } + } + + // Replace the original inputs and outputs of the sub-node with the new JSON objects + j_sub_node["inputs"] = inputs; + j_sub_node["outputs"] = outputs; + + auto check_if_pre_created_tensor = [&created_tensors](std::shared_ptr t) { + if (t == nullptr) { + return t; + } + + if (created_tensors.find(t->get_name()) == created_tensors.end()) { + created_tensors.insert({t->get_name(), t}); + return t; + } else { + return created_tensors[t->get_name()]; + } + }; + +#define CHECK_TENSORS(attributes) \ + for (const auto &[key, tensor] : attributes.inputs) { \ + attributes.inputs[key] = check_if_pre_created_tensor(tensor); \ + } \ + for (const auto &[key, tensor] : attributes.outputs) { \ + attributes.outputs[key] = check_if_pre_created_tensor(tensor); \ + } + +#define FILL_GLOBAL_IO_TENSOR_MAP(attributes) \ + for (auto input_name_to_attr_pair : attributes.inputs) { \ + if (input_name_to_attr_pair.second != nullptr && \ + (input_name_to_attr_pair.second->get_is_virtual() == false)) { \ + full_graph_inputs.emplace(input_name_to_attr_pair.second); \ + } \ + } \ + for (auto output_name_to_attr_pair : attributes.outputs) { \ + if (output_name_to_attr_pair.second != nullptr) { \ + full_graph_outputs.emplace(output_name_to_attr_pair.second); \ + } \ + } + if (j_sub_node.contains("tag") && j_sub_node["tag"].is_string()) { + auto tag = j_sub_node["tag"].get(); + if (tag == "CONV_FPROP") { + auto conv_fprop_attributes = j_sub_node.get(); + CHECK_TENSORS(conv_fprop_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(conv_fprop_attributes); + sub_nodes.emplace_back( + std::make_unique(std::move(conv_fprop_attributes), context)); + } else if (tag == "POINTWISE") { + auto pointwise_attributes = j_sub_node.get(); + CHECK_TENSORS(pointwise_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(pointwise_attributes); + 
sub_nodes.emplace_back( + std::make_unique(std::move(pointwise_attributes), context)); + } else if (tag == "REDUCTION") { + auto reduction_attributes = j_sub_node.get(); + CHECK_TENSORS(reduction_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(reduction_attributes); + sub_nodes.emplace_back( + std::make_unique(std::move(reduction_attributes), context)); + } else if (tag == "SDPA_FWD") { + auto sdpa_attributes = j_sub_node.get(); + CHECK_TENSORS(sdpa_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(sdpa_attributes); + switch (sdpa_attributes.implementation) { + case AttentionImplementation_t::AUTO: + return {error_code_t::INVALID_VALUE, + "Implementation cannot be AUTO in serialized form"}; + case AttentionImplementation_t::COMPOSITE: + sub_nodes.emplace_back( + std::make_unique(std::move(sdpa_attributes), context)); + break; + case AttentionImplementation_t::UNIFIED: + sub_nodes.emplace_back( + std::make_unique(std::move(sdpa_attributes), context)); + } + } else if (tag == "SDPA_BWD") { + auto sdpa_bwd_attributes = j_sub_node.get(); + CHECK_TENSORS(sdpa_bwd_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(sdpa_bwd_attributes); + sub_nodes.emplace_back( + std::make_unique(std::move(sdpa_bwd_attributes), context)); + } else if (tag == "MATMUL") { + auto matmul_attributes = j_sub_node.get(); + CHECK_TENSORS(matmul_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(matmul_attributes); + sub_nodes.emplace_back(std::make_unique(std::move(matmul_attributes), context)); + } else if (tag == "SLICE") { + auto slice_attributes = j_sub_node.get(); + CHECK_TENSORS(slice_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(slice_attributes); + sub_nodes.emplace_back(std::make_unique(std::move(slice_attributes), context)); + } else if (tag == "RESAMPLE") { + auto resample_attributes = j_sub_node.get(); + CHECK_TENSORS(resample_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(resample_attributes); + sub_nodes.emplace_back(std::make_unique(std::move(resample_attributes), context)); + } else if (tag == "CONV_DGRAD") { + auto 
dgrad_attributes = j_sub_node.get(); + CHECK_TENSORS(dgrad_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(dgrad_attributes); + sub_nodes.emplace_back(std::make_unique(std::move(dgrad_attributes), context)); + } else if (tag == "CONV_WGRAD") { + auto wgrad_attributes = j_sub_node.get(); + CHECK_TENSORS(wgrad_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(wgrad_attributes); + sub_nodes.emplace_back(std::make_unique(std::move(wgrad_attributes), context)); + } else if (tag == "MOE_GROUPED_MATMUL") { + auto moe_grouped_matmul_attributes = j_sub_node.get(); + CHECK_TENSORS(moe_grouped_matmul_attributes); + FILL_GLOBAL_IO_TENSOR_MAP(moe_grouped_matmul_attributes); + sub_nodes.emplace_back( + std::make_unique(std::move(moe_grouped_matmul_attributes), context)); + } + } +#undef CHECK_TENSORS + } + } + + return {error_code_t::OK, ""}; + } +#endif + + std::string + print(void) const { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + std::stringstream ss; + json j = *this; + ss << j; + return ss.str(); +#else + return "print is unavailable when compiled with CUDNN_FRONTEND_SKIP_JSON_LIB"; +#endif + } +}; + +inline error_t +Graph::get_behavior_notes_for_plan_at_index(int64_t const index, std::vector ¬es) const { + CHECK_CUDNN_FRONTEND_ERROR(plans.get_behavior_notes_at_index(index, notes)); + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::get_behavior_notes(std::vector ¬es) const { + int64_t const candidate = plans.candidate; + RETURN_CUDNN_FRONTEND_ERROR_IF( + candidate == -1, + error_code_t::INVALID_VALUE, + "No candiate plan set for the graph. You can set one by building a plan, which in turn sets the " + "candidate internally. 
Do note that you also query behaviour notes for a created-but-not-built plan by using " + "get_behavior_notes_for_plan_at_index API."); + + CHECK_CUDNN_FRONTEND_ERROR(get_behavior_notes_for_plan_at_index(candidate, notes)); + return {error_code_t::OK, ""}; +} + +inline int64_t +Graph::get_execution_plan_count() const { + return plans.execution_plans.size(); +} + +inline error_t +Graph::get_engine_count(int64_t &count) { + _CUDNN_CHECK_CUDNN_ERROR(detail::get_attribute(operation_graph->get_raw_desc(), + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT, + CUDNN_TYPE_INT64, + 1, + nullptr, + &count)); + + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::get_knobs_for_engine(int64_t const engine, std::vector &knobs) { + CHECK_CUDNN_FRONTEND_ERROR(detail::query_knobs(engine, operation_graph->get_raw_desc(), knobs)); + + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::create_execution_plans(std::vector const &mode) { + CUDNN_FE_LOG_BANNER(" CREATE EXECUTION PLANS (HEURISTICS QUERY) "); + + // CHECK IF NEED TO OVERRIDE HEURISTICS QUERY + for (auto &sub_node : sub_nodes) { + if (auto [engine_id, user_knobs] = sub_node->override_heuristics_query(); engine_id != -1) { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + CUDNN_FE_LOG_LABEL_ENDL("INFO: Overriding heuristics query with engine ID " + << engine_id << " and user knobs " << nlohmann::json(user_knobs).dump()); +#else + CUDNN_FE_LOG_LABEL_ENDL("INFO: Overriding heuristics query with engine ID " + << engine_id << " and user knobs " << static_cast(user_knobs.size())); +#endif + CHECK_CUDNN_FRONTEND_ERROR(create_execution_plan(engine_id, user_knobs)); + return {error_code_t::OK, ""}; + } + } + + EngineConfigList op_graph_to_configs; + CHECK_CUDNN_FRONTEND_ERROR(detail::query_cudnn_heuristics_impl( + operation_graph, op_graph_to_configs, mode, context.get_target_sm_count(), device_properties)); + + CUDNN_FE_LOG_LABEL_ENDL("INFO: Extracting engine configs."); + + plans.set_tag(operation_graph->getTag()); + 
plans.enqueue_engine_configs(op_graph_to_configs); + plans.set_kernel_cache(kernel_cache); + + CUDNN_FE_LOG_LABEL_ENDL("INFO: Querying engine config properties."); + CHECK_CUDNN_FRONTEND_ERROR(plans.query_properties()); + + CUDNN_FE_LOG_BANNER(" HEURISTICS QUERY ALL OK "); + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::create_execution_plan(int64_t const engine_id, std::unordered_map const &user_knobs) { + // first create the engine + // this just uses the global engine id and operation graph + CUDNN_FE_LOG_BANNER(" CREATE EXECUTION PLAN for engine id " << engine_id << " "); + detail::backend_descriptor engine(CUDNN_BACKEND_ENGINE_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(engine.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create engine's backend descriptor."); + CHECK_CUDNN_FRONTEND_ERROR( + detail::create_engine(engine, engine_id, operation_graph->get_raw_desc(), device_properties)); + + // Create an array of knob choices + std::vector knob_choices; + CHECK_CUDNN_FRONTEND_ERROR(detail::set_knob_choices(user_knobs, knob_choices)); + + auto engine_config = make_shared_backend_pointer((cudnnBackendDescriptorType_t)CUDNN_BACKEND_ENGINECFG_DESCRIPTOR); + CHECK_CUDNN_FRONTEND_ERROR(detail::create_engine_config(engine_config, engine, knob_choices)); + plans.enqueue_engine_configs({engine_config}); + CHECK_CUDNN_FRONTEND_ERROR(plans.query_properties()); + + CUDNN_FE_LOG_BANNER(" CREATE EXECUTION PLAN ALL OK "); + + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::build_plan_at_index(int64_t plan_index) { + CHECK_CUDNN_FRONTEND_ERROR(plans.build_plan_at_index(plan_index)); + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::build_plans(BuildPlanPolicy_t const policy, bool const do_multithreaded_builds) { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + CUDNN_FE_LOG_BANNER(" BUILD PLANS for policy " << nlohmann::json(policy).dump() << " "); +#else + CUDNN_FE_LOG_BANNER(" BUILD PLANS for policy " 
<< static_cast(policy) << " "); +#endif + CHECK_CUDNN_FRONTEND_ERROR(plans.build_plans(policy, do_multithreaded_builds)); + CUDNN_FE_LOG_BANNER(" BUILD PLANS ALL OK "); + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::build(cudnnHandle_t const &handle, + std::vector const &modes, + BuildPlanPolicy_t const policy, + bool const do_multithreaded_builds) { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + CUDNN_FE_LOG_BANNER(" BUILD with handle " << nlohmann::json(policy).dump()); +#else + CUDNN_FE_LOG_BANNER(" BUILD with handle " << static_cast(policy) << " "); +#endif + CHECK_CUDNN_FRONTEND_ERROR(this->validate()); + CHECK_CUDNN_FRONTEND_ERROR(this->build_operation_graph(handle)); + CHECK_CUDNN_FRONTEND_ERROR(this->create_execution_plans(modes)); + CHECK_CUDNN_FRONTEND_ERROR(this->check_support()); + CHECK_CUDNN_FRONTEND_ERROR(this->build_plans(policy, do_multithreaded_builds)); + CUDNN_FE_LOG_BANNER(" BUILD ALL OK (with handle) "); + return {error_code_t::OK, ""}; +} + +inline error_t +Graph::build(std::vector const &modes, BuildPlanPolicy_t const policy, bool const do_multithreaded_builds) { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + CUDNN_FE_LOG_BANNER(" BUILD PLANS without handle " << nlohmann::json(policy).dump() << " "); +#else + CUDNN_FE_LOG_BANNER(" BUILD PLANS without handle " << static_cast(policy) << " "); +#endif + CHECK_CUDNN_FRONTEND_ERROR(this->validate()); + CHECK_CUDNN_FRONTEND_ERROR(this->build_operation_graph()); + CHECK_CUDNN_FRONTEND_ERROR(this->create_execution_plans(modes)); + CHECK_CUDNN_FRONTEND_ERROR(this->check_support()); + CHECK_CUDNN_FRONTEND_ERROR(this->build_plans(policy, do_multithreaded_builds)); + CUDNN_FE_LOG_BANNER(" BUILD PLANS ALL OK (no handle) "); + return {error_code_t::OK, ""}; +} + +inline Graph & +Graph::set_intermediate_data_type(DataType_t const type) { + context.set_intermediate_data_type(type); + return *this; +} + +inline Graph & +Graph::set_io_data_type(DataType_t const type) { + context.set_io_data_type(type); + 
return *this; +} + +inline Graph & +Graph::set_compute_data_type(DataType_t const type) { + context.set_compute_data_type(type); + return *this; +} + +inline Graph & +Graph::set_dynamic_shape_enabled(bool is_enabled) { + context.set_dynamic_shape_enabled(is_enabled); + this->is_dynamic_shape_enabled = is_enabled; + return *this; +} + +inline Graph & +Graph::set_kernel_cache(std::shared_ptr cache) { + kernel_cache = cache; + return *this; +} + +inline Graph & +Graph::set_device_properties(std::shared_ptr device_prop) { + device_properties = device_prop; + return *this; +} + +inline Graph & +Graph::set_sm_count(int32_t count) { + context.set_target_sm_count(count); + return *this; +} + +inline Graph & +Graph::set_sm_version(int32_t version) { + context.set_sm_version(version); + return *this; +} + +inline std::shared_ptr +Graph::tensor(Tensor_attributes const &tensor) { + auto tensor_ptr = std::make_shared(tensor); + full_graph_inputs.emplace(tensor_ptr); + return tensor_ptr; +} + +inline error_t +Graph::query_tensor_attributes_of_uid(int64_t const uid, Tensor_attributes &tensor) const { + for (auto const &o_tensor : full_graph_outputs) { + if (uid == o_tensor->get_uid()) { + tensor = *o_tensor; + return {error_code_t::OK, ""}; + } + } + + for (auto const &i_tensor : full_graph_inputs) { + if (uid == i_tensor->get_uid()) { + tensor = *i_tensor; + return {error_code_t::OK, ""}; + } + } + + for (auto const &d_tensor : deserialized_tensor_properties) { + if (uid == d_tensor->get_uid()) { + tensor = *d_tensor; + return {error_code_t::OK, ""}; + } + } + + return {error_code_t::INVALID_VALUE, "No matching tensor for this UID"}; +} + +// tensor_like is meant to create "useable" copies of a tensor. +// By usable, it means not copying over the uids, as uids are FE-level(internal) detail. +// It also means not copying over names, which are user-level(external) detail. But user is given option to provide a +// new name. 
+inline std::shared_ptr +Graph::tensor_like(std::shared_ptr const &tensor, std::string const &name) { + auto tensor_ptr = std::make_shared(*tensor); + + // reset the uid of the cloned tensor + // uids are not meant to be copied by tensor_like + // When lowering to cudnn backend, both tensors involved here will get unique uids. + tensor_ptr->clear_uid(); + + // reset the name too. Defaults to empty string. + tensor_ptr->set_name(name); + full_graph_inputs.emplace(tensor_ptr); + + return tensor_ptr; +} + +inline std::array, 6> +Graph::bn_finalize(std::shared_ptr sum, + std::shared_ptr sq_sum, + std::shared_ptr scale, + std::shared_ptr bias, + std::shared_ptr epsilon, + std::shared_ptr accum_count, + BN_finalize_attributes attributes) { + // Set outputs + auto EQ_SCALE = attributes.outputs[BN_finalize_attributes::output_names::EQ_SCALE] = + output_tensor(attributes.name + "::EQ_SCALE"); + auto EQ_BIAS = attributes.outputs[BN_finalize_attributes::output_names::EQ_BIAS] = + output_tensor(attributes.name + "::EQ_BIAS"); + auto MEAN = attributes.outputs[BN_finalize_attributes::output_names::MEAN] = + output_tensor(attributes.name + "::MEAN"); + auto INV_VARIANCE = attributes.outputs[BN_finalize_attributes::output_names::INV_VARIANCE] = + output_tensor(attributes.name + "::INV_VARIANCE"); + std::shared_ptr NEXT_RUNNING_MEAN = nullptr; + std::shared_ptr NEXT_RUNNING_VAR = nullptr; + if (attributes.inputs[BN_finalize_attributes::input_names::PREV_RUNNING_MEAN] && + attributes.inputs[BN_finalize_attributes::input_names::PREV_RUNNING_VAR] && + attributes.inputs[BN_finalize_attributes::input_names::MOMENTUM]) { + NEXT_RUNNING_MEAN = output_tensor(attributes.name + "::NEXT_RUNNING_MEAN"); + NEXT_RUNNING_VAR = output_tensor(attributes.name + "::NEXT_RUNNING_VAR"); + } + attributes.outputs[BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN] = NEXT_RUNNING_MEAN; + attributes.outputs[BN_finalize_attributes::output_names::NEXT_RUNNING_VAR] = NEXT_RUNNING_VAR; + + // Set inputs + 
attributes.inputs[BN_finalize_attributes::input_names::SUM] = sum; + attributes.inputs[BN_finalize_attributes::input_names::SQ_SUM] = sq_sum; + attributes.inputs[BN_finalize_attributes::input_names::SCALE] = scale; + attributes.inputs[BN_finalize_attributes::input_names::BIAS] = bias; + attributes.inputs[BN_finalize_attributes::input_names::EPSILON] = epsilon; + attributes.inputs[BN_finalize_attributes::input_names::ACCUM_COUNT] = accum_count; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {EQ_SCALE, EQ_BIAS, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR}; +} + +inline std::array, 3> +Graph::layernorm(std::shared_ptr x, + std::shared_ptr scale, + std::shared_ptr bias, + Layernorm_attributes attributes) { + // Set outputs + auto Y = attributes.outputs[Layernorm_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + std::shared_ptr MEAN = nullptr; + std::shared_ptr INV_VARIANCE = nullptr; + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + MEAN = attributes.outputs[Layernorm_attributes::output_names::MEAN] = output_tensor(attributes.name + "::MEAN"); + INV_VARIANCE = attributes.outputs[Layernorm_attributes::output_names::INV_VARIANCE] = + output_tensor(attributes.name + "::INV_VARIANCE"); + } + // Set inputs + attributes.inputs[Layernorm_attributes::input_names::X] = x; + attributes.inputs[Layernorm_attributes::input_names::SCALE] = scale; + attributes.inputs[Layernorm_attributes::input_names::BIAS] = bias; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {Y, MEAN, INV_VARIANCE}; +} + +inline std::array, 3> +Graph::adalayernorm(std::shared_ptr x, + std::shared_ptr scale, + std::shared_ptr bias, + AdaLayernorm_attributes attributes) { + // Set outputs + auto Y = attributes.outputs[AdaLayernorm_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + std::shared_ptr MEAN = nullptr; + std::shared_ptr INV_VARIANCE = nullptr; + if 
(attributes.forward_phase == NormFwdPhase_t::TRAINING) { + MEAN = attributes.outputs[AdaLayernorm_attributes::output_names::MEAN] = + output_tensor(attributes.name + "::MEAN"); + INV_VARIANCE = attributes.outputs[AdaLayernorm_attributes::output_names::INV_VARIANCE] = + output_tensor(attributes.name + "::INV_VARIANCE"); + } + // Set inputs + attributes.inputs[AdaLayernorm_attributes::input_names::X] = x; + attributes.inputs[AdaLayernorm_attributes::input_names::SCALE] = scale; + attributes.inputs[AdaLayernorm_attributes::input_names::BIAS] = bias; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {std::move(Y), std::move(MEAN), std::move(INV_VARIANCE)}; +} + +inline std::array, 3> +Graph::instancenorm(std::shared_ptr x, + std::shared_ptr scale, + std::shared_ptr bias, + Instancenorm_attributes attributes) { + // Set outputs + auto Y = attributes.outputs[Instancenorm_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + std::shared_ptr MEAN = nullptr; + std::shared_ptr INV_VARIANCE = nullptr; + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + MEAN = attributes.outputs[Instancenorm_attributes::output_names::MEAN] = + output_tensor(attributes.name + "::MEAN"); + INV_VARIANCE = attributes.outputs[Instancenorm_attributes::output_names::INV_VARIANCE] = + output_tensor(attributes.name + "::INV_VARIANCE"); + } + // Set inputs + attributes.inputs[Instancenorm_attributes::input_names::X] = x; + attributes.inputs[Instancenorm_attributes::input_names::SCALE] = scale; + attributes.inputs[Instancenorm_attributes::input_names::BIAS] = bias; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {Y, MEAN, INV_VARIANCE}; +} + +inline std::array, 5> +Graph::batchnorm(std::shared_ptr x, + std::shared_ptr scale, + std::shared_ptr bias, + Batchnorm_attributes attributes) { + // Set outputs + auto Y = attributes.outputs[Batchnorm_attributes::output_names::Y] = 
output_tensor(attributes.name + "::Y"); + auto MEAN = attributes.outputs[Batchnorm_attributes::output_names::MEAN] = + output_tensor(attributes.name + "::MEAN"); + auto INV_VARIANCE = attributes.outputs[Batchnorm_attributes::output_names::INV_VARIANCE] = + output_tensor(attributes.name + "::INV_VARIANCE"); + std::shared_ptr NEXT_RUNNING_MEAN = nullptr; + std::shared_ptr NEXT_RUNNING_VAR = nullptr; + if (attributes.inputs[Batchnorm_attributes::input_names::PREV_RUNNING_MEAN] && + attributes.inputs[Batchnorm_attributes::input_names::PREV_RUNNING_VAR] && + attributes.inputs[Batchnorm_attributes::input_names::MOMENTUM]) { + NEXT_RUNNING_MEAN = output_tensor(attributes.name + "::NEXT_RUNNING_MEAN"); + NEXT_RUNNING_VAR = output_tensor(attributes.name + "::NEXT_RUNNING_VAR"); + } + attributes.outputs[Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN] = NEXT_RUNNING_MEAN; + attributes.outputs[Batchnorm_attributes::output_names::NEXT_RUNNING_VAR] = NEXT_RUNNING_VAR; + + // Set inputs + attributes.inputs[Batchnorm_attributes::input_names::X] = x; + attributes.inputs[Batchnorm_attributes::input_names::SCALE] = scale; + attributes.inputs[Batchnorm_attributes::input_names::BIAS] = bias; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {Y, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR}; +} + +inline std::shared_ptr +Graph::batchnorm_inference(std::shared_ptr x, + std::shared_ptr mean, + std::shared_ptr inv_variance, + std::shared_ptr scale, + std::shared_ptr bias, + Batchnorm_inference_attributes attributes) { + // Set outputs + auto Y = attributes.outputs[Batchnorm_inference_attributes::output_names::Y] = + output_tensor(attributes.name + "::Y"); + + // Set inputs + attributes.inputs[Batchnorm_inference_attributes::input_names::X] = x; + attributes.inputs[Batchnorm_inference_attributes::input_names::MEAN] = mean; + attributes.inputs[Batchnorm_inference_attributes::input_names::INV_VARIANCE] = inv_variance; + 
attributes.inputs[Batchnorm_inference_attributes::input_names::SCALE] = scale; + attributes.inputs[Batchnorm_inference_attributes::input_names::BIAS] = bias; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return Y; +} + +inline std::array, 3> +Graph::batchnorm_backward(std::shared_ptr dy, + std::shared_ptr x, + std::shared_ptr scale, + Batchnorm_backward_attributes attributes) { + // Set outputs + auto DX = attributes.outputs[Batchnorm_backward_attributes::output_names::DX] = + output_tensor(attributes.name + "::DX"); + auto DSCALE = attributes.outputs[Batchnorm_backward_attributes::output_names::DSCALE] = + output_tensor(attributes.name + "::DSCALE"); + auto DBIAS = attributes.outputs[Batchnorm_backward_attributes::output_names::DBIAS] = + output_tensor(attributes.name + "::DBIAS"); + + // Set inputs + attributes.inputs[Batchnorm_backward_attributes::input_names::DY] = dy; + attributes.inputs[Batchnorm_backward_attributes::input_names::X] = x; + attributes.inputs[Batchnorm_backward_attributes::input_names::SCALE] = scale; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {DX, DSCALE, DBIAS}; +} + +inline std::array, 3> +Graph::instancenorm_backward(std::shared_ptr dy, + std::shared_ptr x, + std::shared_ptr scale, + Instancenorm_backward_attributes attributes) { + // Set outputs + auto DX = attributes.outputs[Instancenorm_backward_attributes::output_names::DX] = + output_tensor(attributes.name + "::DX"); + auto DSCALE = attributes.outputs[Instancenorm_backward_attributes::output_names::DSCALE] = + output_tensor(attributes.name + "::DSCALE"); + auto DBIAS = attributes.outputs[Instancenorm_backward_attributes::output_names::DBIAS] = + output_tensor(attributes.name + "::DBIAS"); + + // Set inputs + attributes.inputs[Instancenorm_backward_attributes::input_names::DY] = dy; + attributes.inputs[Instancenorm_backward_attributes::input_names::X] = x; + 
attributes.inputs[Instancenorm_backward_attributes::input_names::SCALE] = scale; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {DX, DSCALE, DBIAS}; +} + +inline std::array, 3> +Graph::layernorm_backward(std::shared_ptr dy, + std::shared_ptr x, + std::shared_ptr scale, + Layernorm_backward_attributes attributes) { + // Set outputs + auto DX = attributes.outputs[Layernorm_backward_attributes::output_names::DX] = + output_tensor(attributes.name + "::DX"); + auto DSCALE = attributes.outputs[Layernorm_backward_attributes::output_names::DSCALE] = + output_tensor(attributes.name + "::DSCALE"); + auto DBIAS = attributes.outputs[Layernorm_backward_attributes::output_names::DBIAS] = + output_tensor(attributes.name + "::DBIAS"); + + // Set inputs + attributes.inputs[Layernorm_backward_attributes::input_names::DY] = dy; + attributes.inputs[Layernorm_backward_attributes::input_names::X] = x; + attributes.inputs[Layernorm_backward_attributes::input_names::SCALE] = scale; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {DX, DSCALE, DBIAS}; +} + +inline std::array, 3> +Graph::adalayernorm_backward(std::shared_ptr dy, + std::shared_ptr x, + std::shared_ptr scale, + AdaLayernorm_backward_attributes attributes) { + // Set outputs + auto DX = attributes.outputs[AdaLayernorm_backward_attributes::output_names::DX] = + output_tensor(attributes.name + "::DX"); + auto DSCALE = attributes.outputs[AdaLayernorm_backward_attributes::output_names::DSCALE] = + output_tensor(attributes.name + "::DSCALE"); + auto DBIAS = attributes.outputs[AdaLayernorm_backward_attributes::output_names::DBIAS] = + output_tensor(attributes.name + "::DBIAS"); + // Set inputs + attributes.inputs[AdaLayernorm_backward_attributes::input_names::DY] = dy; + attributes.inputs[AdaLayernorm_backward_attributes::input_names::X] = x; + attributes.inputs[AdaLayernorm_backward_attributes::input_names::SCALE] = scale; + + 
sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {std::move(DX), std::move(DSCALE), std::move(DBIAS)}; +} + +inline std::shared_ptr +Graph::conv_fprop(std::shared_ptr x, + std::shared_ptr w, + Conv_fprop_attributes attributes) { + // Make required output tensors + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + auto Y = output_tensor(attributes.name + "::Y"); + attributes.outputs[Conv_fprop_attributes::output_names::Y] = Y; + + // Set inputs + attributes.inputs[Conv_fprop_attributes::input_names::X] = x; + attributes.inputs[Conv_fprop_attributes::input_names::W] = w; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return Y; +} + +inline std::array, 5> +Graph::dbn_weight(std::shared_ptr dy, + std::shared_ptr x, + std::shared_ptr mean, + std::shared_ptr inv_variance, + std::shared_ptr scale, + DBN_weight_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + // Make required output tensors + auto DBIAS = attributes.outputs[DBN_weight_attributes::output_names::DBIAS] = + output_tensor(attributes.name + "::DBIAS"); + auto DSCALE = attributes.outputs[DBN_weight_attributes::output_names::DSCALE] = + output_tensor(attributes.name + "::DSCALE"); + auto EQ_BIAS = attributes.outputs[DBN_weight_attributes::output_names::EQ_BIAS] = + output_tensor(attributes.name + "::EQ_BIAS"); + auto EQ_SCALE_DY = attributes.outputs[DBN_weight_attributes::output_names::EQ_SCALE_DY] = + output_tensor(attributes.name + "::EQ_SCALE_DY"); + auto EQ_SCALE_X = attributes.outputs[DBN_weight_attributes::output_names::EQ_SCALE_X] = + output_tensor(attributes.name + "::EQ_SCALE_X"); + + // Set inputs + attributes.inputs[DBN_weight_attributes::input_names::DY] = dy; + attributes.inputs[DBN_weight_attributes::input_names::X] = x; + attributes.inputs[DBN_weight_attributes::input_names::SCALE] = scale; + 
attributes.inputs[DBN_weight_attributes::input_names::MEAN] = mean; + attributes.inputs[DBN_weight_attributes::input_names::INV_VARIANCE] = inv_variance; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {DSCALE, DBIAS, EQ_SCALE_DY, EQ_SCALE_X, EQ_BIAS}; +} + +inline std::shared_ptr +Graph::conv_dgrad(std::shared_ptr dy, + std::shared_ptr w, + Conv_dgrad_attributes attributes) { + // Make required output tensors + auto DX = attributes.outputs[Conv_dgrad_attributes::output_names::DX] = output_tensor(attributes.name + "::DX"); + + // Set inputs + attributes.inputs[Conv_dgrad_attributes::input_names::DY] = dy; + attributes.inputs[Conv_dgrad_attributes::input_names::W] = w; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return DX; +} + +inline std::array, 2> +Graph::genstats(std::shared_ptr x, Genstats_attributes attributes) { + // Set outputs + auto SUM = attributes.outputs[Genstats_attributes::output_names::SUM] = + output_tensor(attributes.name + "_sum_output"); + auto SQ_SUM = attributes.outputs[Genstats_attributes::output_names::SQ_SUM] = + output_tensor(attributes.name + "_sq_sum_output"); + + // Set inputs + attributes.inputs[Genstats_attributes::input_names::X] = x; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {SUM, SQ_SUM}; +} + +inline std::shared_ptr +Graph::conv_wgrad(std::shared_ptr dy, + std::shared_ptr x, + Conv_wgrad_attributes attributes) { + // Make required output tensors + auto DW = attributes.outputs[Conv_wgrad_attributes::output_names::DW] = output_tensor(attributes.name + "::DW"); + + // Set inputs + attributes.inputs[Conv_wgrad_attributes::input_names::X] = x; + attributes.inputs[Conv_wgrad_attributes::input_names::DY] = dy; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return DW; +} + +inline std::array, 2> +Graph::rmsnorm(std::shared_ptr x, + std::shared_ptr scale, + Rmsnorm_attributes 
attributes) { + // Set outputs + auto Y = attributes.outputs[Rmsnorm_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + std::shared_ptr INV_VARIANCE = nullptr; + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + INV_VARIANCE = attributes.outputs[Rmsnorm_attributes::output_names::INV_VARIANCE] = + output_tensor(attributes.name + "::INV_VARIANCE"); + } + // Set inputs + attributes.inputs[Rmsnorm_attributes::input_names::X] = x; + attributes.inputs[Rmsnorm_attributes::input_names::SCALE] = scale; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {Y, INV_VARIANCE}; +} + +inline std::array, 3> +Graph::rmsnorm_backward(std::shared_ptr dy, + std::shared_ptr x, + std::shared_ptr scale, + std::shared_ptr inv_variance, + Rmsnorm_backward_attributes attributes) { + // Set outputs + auto DX = attributes.outputs[Rmsnorm_backward_attributes::output_names::DX] = + output_tensor(attributes.name + "::DX"); + auto DScale = attributes.outputs[Rmsnorm_backward_attributes::output_names::DSCALE] = + output_tensor(attributes.name + "::Dscale"); + std::shared_ptr DBias = nullptr; + if (attributes.use_dbias.value_or(true)) { + DBias = attributes.outputs[Rmsnorm_backward_attributes::output_names::DBIAS] = + output_tensor(attributes.name + "::Dbias"); + } + + // Set inputs + attributes.inputs[Rmsnorm_backward_attributes::input_names::DY] = dy; + attributes.inputs[Rmsnorm_backward_attributes::input_names::X] = x; + attributes.inputs[Rmsnorm_backward_attributes::input_names::SCALE] = scale; + attributes.inputs[Rmsnorm_backward_attributes::input_names::INV_VARIANCE] = inv_variance; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {DX, DScale, DBias}; +} + +inline std::array, 2> +Graph::sdpa(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr v, + SDPA_attributes attributes) { + //优化性能 + if (attributes.mma_core_mode == DataType_t::NOT_SET) { + 
attributes._set_mma_core_mode(DataType_t::HALF); + } + + // Call internal implementation and return only the O and Stats outputs for backward compatibility + auto internal_result = sdpa_internal(q, k, v, std::move(attributes)); + return {internal_result.O, internal_result.Stats}; +} + +inline std::array, 4> +Graph::sdpa_fp8(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr v, + std::shared_ptr descale_q, + std::shared_ptr descale_k, + std::shared_ptr descale_v, + std::shared_ptr descale_s, + std::shared_ptr scale_s, + std::shared_ptr scale_o, + SDPA_fp8_attributes attributes) { + if (attributes.mma_core_mode == DataType_t::NOT_SET) { + attributes._set_mma_core_mode(DataType_t::FP8_E4M3); + } + + // Set FP8 scaling inputs + attributes.inputs[SDPA_fp8_attributes::input_names::Descale_Q] = descale_q; + attributes.inputs[SDPA_fp8_attributes::input_names::Descale_K] = descale_k; + attributes.inputs[SDPA_fp8_attributes::input_names::Descale_V] = descale_v; + attributes.inputs[SDPA_fp8_attributes::input_names::Descale_S] = descale_s; + attributes.inputs[SDPA_fp8_attributes::input_names::Scale_S] = scale_s; + attributes.inputs[SDPA_fp8_attributes::input_names::Scale_O] = scale_o; + + // Call internal implementation and return {Output, Stats, Amax_S, Amax_O} as array for backward compatibility + auto internal_result = sdpa_internal(q, k, v, std::move(attributes)); + return {internal_result.O, internal_result.Stats, internal_result.Amax_S, internal_result.Amax_O}; +} + +inline std::array, 7> +Graph::sdpa_fp8_backward(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr v, + std::shared_ptr o, + std::shared_ptr dO, + std::shared_ptr Stats, + std::shared_ptr descale_q, + std::shared_ptr descale_k, + std::shared_ptr descale_v, + std::shared_ptr descale_o, + std::shared_ptr descale_do, + std::shared_ptr descale_s, + std::shared_ptr descale_dp, + std::shared_ptr scale_s, + std::shared_ptr scale_dq, + std::shared_ptr scale_dk, + std::shared_ptr scale_dv, + 
std::shared_ptr scale_dp, + SDPA_fp8_backward_attributes attributes) { + // Make required output tensors + auto dQ = attributes.outputs[SDPA_fp8_backward_attributes::output_names::dQ] = + output_tensor(attributes.name + "::dQ"); + auto dK = attributes.outputs[SDPA_fp8_backward_attributes::output_names::dK] = + output_tensor(attributes.name + "::dK"); + auto dV = attributes.outputs[SDPA_fp8_backward_attributes::output_names::dV] = + output_tensor(attributes.name + "::dV"); + auto Amax_dQ = attributes.outputs[SDPA_fp8_backward_attributes::output_names::Amax_dQ] = + output_tensor(attributes.name + "::Amax_dQ"); + auto Amax_dK = attributes.outputs[SDPA_fp8_backward_attributes::output_names::Amax_dK] = + output_tensor(attributes.name + "::Amax_dK"); + auto Amax_dV = attributes.outputs[SDPA_fp8_backward_attributes::output_names::Amax_dV] = + output_tensor(attributes.name + "::Amax_dV"); + auto Amax_dP = attributes.outputs[SDPA_fp8_backward_attributes::output_names::Amax_dP] = + output_tensor(attributes.name + "::Amax_dP"); + + // Set inputs + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Q] = q; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::K] = k; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::V] = v; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::O] = o; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Stats] = Stats; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::dO] = dO; + + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Descale_Q] = descale_q; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Descale_K] = descale_k; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Descale_V] = descale_v; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Descale_S] = descale_s; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Descale_O] = descale_o; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Descale_dO] 
= descale_do; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Descale_dP] = descale_dp; + + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Scale_dQ] = scale_dq; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Scale_dK] = scale_dk; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Scale_dV] = scale_dv; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Scale_S] = scale_s; + attributes.inputs[SDPA_fp8_backward_attributes::input_names::Scale_dP] = scale_dp; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {dQ, dK, dV, Amax_dQ, Amax_dK, Amax_dV, Amax_dP}; +} + +inline std::array, 3> +Graph::sdpa_backward(std::shared_ptr q, + std::shared_ptr k, + std::shared_ptr v, + std::shared_ptr o, + std::shared_ptr dO, + std::shared_ptr stats, + SDPA_backward_attributes attributes) { + // Set inputs + attributes.inputs[SDPA_backward_attributes::input_names::Q] = q; + attributes.inputs[SDPA_backward_attributes::input_names::K] = k; + attributes.inputs[SDPA_backward_attributes::input_names::V] = v; + attributes.inputs[SDPA_backward_attributes::input_names::O] = o; + attributes.inputs[SDPA_backward_attributes::input_names::dO] = dO; + attributes.inputs[SDPA_backward_attributes::input_names::Stats] = stats; + + // Make required output tensors + auto dQ = attributes.outputs[SDPA_backward_attributes::output_names::dQ] = output_tensor(attributes.name + "::dQ"); + auto dK = attributes.outputs[SDPA_backward_attributes::output_names::dK] = output_tensor(attributes.name + "::dK"); + auto dV = attributes.outputs[SDPA_backward_attributes::output_names::dV] = output_tensor(attributes.name + "::dV"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {dQ, dK, dV}; +} + +inline std::shared_ptr +Graph::slice(std::shared_ptr input, Slice_attributes attributes) { + attributes.inputs[Slice_attributes::input_names::X] = input; + auto Y = 
attributes.outputs[Slice_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return Y; +} + +inline std::array, 2> +Graph::block_scale_quantize(std::shared_ptr x, Block_scale_quantize_attributes attributes) { + // Set outputs + auto Y = attributes.outputs[Block_scale_quantize_attributes::output_names::Y] = + output_tensor(attributes.name + "::Y"); + auto scale = attributes.outputs[Block_scale_quantize_attributes::output_names::scale] = + output_tensor(attributes.name + "::scale"); + + // Set inputs + attributes.inputs[Block_scale_quantize_attributes::input_names::X] = x; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return {Y, scale}; +} + +inline std::shared_ptr +Graph::block_scale_dequantize(std::shared_ptr x, + std::shared_ptr scale, + Block_scale_dequantize_attributes attributes) { + // Set outputs + auto Y = attributes.outputs[Block_scale_dequantize_attributes::output_names::Y] = + output_tensor(attributes.name + "::Y"); + + // Set inputs + attributes.inputs[Block_scale_dequantize_attributes::input_names::X] = x; + attributes.inputs[Block_scale_dequantize_attributes::input_names::scale] = scale; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return Y; +} + +inline std::shared_ptr +Graph::concatenate(std::vector> x, Concatenate_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + + // Set outputs + auto Y = attributes.outputs[Concatenate_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + + // Set inputs + for (auto &element : x) { + attributes.inputs.push_back(element); + } + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return Y; +} + +inline std::shared_ptr +Graph::moe_grouped_matmul(std::shared_ptr token, + std::shared_ptr weight, + std::shared_ptr first_token_offset, + 
std::shared_ptr token_index, + std::shared_ptr token_ks, + Moe_grouped_matmul_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + + auto output = attributes.outputs[Moe_grouped_matmul_attributes::output_names::Output] = + output_tensor(attributes.name + "::Output"); + + attributes.inputs[Moe_grouped_matmul_attributes::input_names::Token] = token; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::Weight] = weight; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::FirstTokenOffset] = first_token_offset; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::TokenIndex] = token_index; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::TokenKs] = token_ks; + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + + return output; +} + +static inline std::ostream & +operator<<(std::ostream &os, Graph const &graph) { + os << graph.print(); + return os; +} + +} // namespace cudnn_frontend::graph diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/graph_properties.h b/third_party/cudnn-frontend/include/cudnn_frontend/graph_properties.h new file mode 100644 index 00000000..03b31b56 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/graph_properties.h @@ -0,0 +1,2655 @@ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "context.h" + +#include "../cudnn_frontend_utils.h" + +namespace cudnn_frontend { + +namespace graph { + +using managed_backend_descriptor_t = std::vector; + +// simple structure to hold all properties of a tensor. +// Each property has a getter setter. +class Tensor_attributes { + public: + using uid_t = int64_t; + + // There are two usecases of pass by value tensors: + // 1. Fused scalar constants + // 2. Scalar passed during execution + // In approach 1, users provide a value to embed into the graph. 
+ // In approach 2, users set is_pass_by_value boolean and then pass a pointer to scalar value with execute() API. + // A closed set of types that are allowed to be passed by value. + using pass_by_values_t = std::variant; + + error_t + validate() const { + RETURN_CUDNN_FRONTEND_ERROR_IF( + dim.empty(), error_code_t::ATTRIBUTE_NOT_SET, "Tensor '" + name + "' dims not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + stride.empty(), error_code_t::ATTRIBUTE_NOT_SET, "Tensor '" + name + "' strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF(dim.size() != stride.size(), + error_code_t::ATTRIBUTE_NOT_SET, + "Tensor '" + name + "' does not equal dimensionality in dim and stride."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + is_virtual && is_pass_by_value, + error_code_t::ATTRIBUTE_NOT_SET, + "Tensor '" + name + "' can't be both virutal and pass_by_value at the same time."); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + pass_by_value.has_value() & (!is_pass_by_value), + error_code_t::ATTRIBUTE_NOT_SET, + "Tensor '" + name + "' can't be a fused scalar and not a pass_by_value tensor at the same time."); + + return {error_code_t::OK, ""}; + } + + private: + template + friend class Attributes; + + std::string name; + DataType_t data_type = DataType_t::NOT_SET; + std::vector dim = {}; + std::vector stride = {}; + bool is_virtual = false; + + std::optional pass_by_value = std::nullopt; + bool is_pass_by_value = false; + + TensorReordering_t reordering_type = TensorReordering_t::NONE; + uid_t uid = 0; + bool uid_assigned = false; + + std::shared_ptr ragged_offset; + int64_t alignment = 16; // Default to 16 bytes + int64_t vector_count = 1; // Default to 1 (no vectorization) + int64_t vector_dimension = -1; // Default to -1 (not set) + + auto + fill_from_context(detail::Context const& context) -> Tensor_attributes& { + if (get_data_type() == DataType_t::NOT_SET) { + if (get_is_virtual()) { + set_data_type(context.get_intermediate_data_type()); + } else { + set_data_type(context.get_io_data_type()); 
+ } + } + return *this; + } + + public: + // Serialization functions +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + friend void + to_json(nlohmann::json& j, const Tensor_attributes& ta); + friend void + from_json(const nlohmann::json& j, Tensor_attributes& ta); +#endif + + Tensor_attributes() = default; + + Tensor_attributes(float const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::FLOAT; + } + + Tensor_attributes(half const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::HALF; + } + + Tensor_attributes(nv_bfloat16 const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::BFLOAT16; + } + + Tensor_attributes(int32_t const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::INT32; + } + + Tensor_attributes(int64_t const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::INT64; + } + + std::string + get_name() const { + return name; + } + + auto + set_name(std::string const& value) -> Tensor_attributes& { + name = value; + return *this; + } + + DataType_t + get_data_type() const { + return data_type; + } + + auto + set_data_type(DataType_t const value) -> Tensor_attributes& { + data_type = value; + return *this; + } + + std::vector + get_dim() const { + return dim; + } + + auto + set_dim(std::vector const& value) -> Tensor_attributes& { + dim = value; + return *this; + } + + int64_t + get_volume() const { + int64_t volume = 1ul; + for (int64_t d : dim) { + volume *= d; + } + return volume; + } + + std::vector + get_stride() const { + return stride; + } + + auto + set_stride(std::vector const& value) -> Tensor_attributes& { + stride = value; + return *this; + } + + bool + get_is_virtual() const { + return is_virtual; + } + + std::shared_ptr + get_ragged_offset() { + 
return ragged_offset; + } + + auto + set_is_virtual(bool const value) -> Tensor_attributes& { + is_virtual = value; + return *this; + } + + auto + set_output(bool const value) -> Tensor_attributes& { + return set_is_virtual(!value); + } + + std::optional + get_pass_by_value() const { + return pass_by_value; + } + + bool + get_is_pass_by_value() const { + return is_pass_by_value; + } + + auto + set_is_pass_by_value(bool const value) -> Tensor_attributes& { + is_pass_by_value = value; + return *this; + } + + TensorReordering_t + get_reordering_type() const { + return reordering_type; + } + + auto + set_reordering_type(TensorReordering_t const value) -> Tensor_attributes& { + reordering_type = value; + return *this; + } + + int64_t + get_alignment() const { + return alignment; + } + + auto + set_alignment(int64_t const value) -> Tensor_attributes& { + alignment = value; + return *this; + } + + int64_t + get_vector_count() const { + return vector_count; + } + + int64_t + get_vector_dimension() const { + return vector_dimension; + } + + auto + set_vector_count_and_dimension(int64_t const count, int64_t const dimension) -> Tensor_attributes& { + vector_count = count; + vector_dimension = dimension; + return *this; + } + + uid_t + get_uid() const { + return uid; + } + + uid_t + has_uid() const { + return uid_assigned; + } + + auto + clear_uid(void) -> Tensor_attributes& { + uid = 0; + uid_assigned = false; + return *this; + } + + auto + set_uid(uid_t value) -> Tensor_attributes& { + uid = value; + uid_assigned = true; + return *this; + } + + auto + set_ragged_offset(std::shared_ptr const& value) -> Tensor_attributes& { + ragged_offset = value; + return *this; + } +}; + +class Batchnorm_attributes; +class Batchnorm_backward_attributes; +class Concatenate_attributes; + +template +class Attributes { + DerivedT& + self() { + return *static_cast(this); + } + DerivedT const& + self() const { + return *static_cast(this); + } + + protected: + std::vector + get_non_virtual_uids() 
const { + std::vector non_virtual_uids; + auto derived = static_cast(this); + if constexpr (std::is_same_v) { + for (auto tensor : derived->inputs) { + if (tensor && tensor->get_is_virtual() == false) { + non_virtual_uids.push_back(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + non_virtual_uids.push_back(ragged_offset->get_uid()); + } + } + } + } else { + for (auto& [name, tensor] : derived->inputs) { + (void)name; + if (tensor && tensor->get_is_virtual() == false) { + non_virtual_uids.push_back(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + non_virtual_uids.push_back(ragged_offset->get_uid()); + } + } + } + } + + for (auto& [name, tensor] : derived->outputs) { + (void)name; + if (tensor && tensor->get_is_virtual() == false) { + non_virtual_uids.push_back(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + non_virtual_uids.push_back(ragged_offset->get_uid()); + } + } + } + + // Handle special case of BN where peer_stats is also an input + if constexpr (std::is_same_v || + std::is_same_v) { + for (auto& tensor : derived->peer_stats) { + if (tensor && tensor->get_is_virtual() == false) { + non_virtual_uids.push_back(tensor->get_uid()); + if (auto ragged_offset = tensor->get_ragged_offset()) { + non_virtual_uids.push_back(ragged_offset->get_uid()); + } + } + } + } + + return non_virtual_uids; + } + + public: + error_t + fill_pass_by_value(std::unordered_map& + tensor_to_pass_by_value) const { + auto derived = static_cast(this); + if constexpr (std::is_same_v) { + for (auto& tensor : derived->inputs) { + if (tensor && tensor->get_pass_by_value().has_value()) { + tensor_to_pass_by_value.emplace(tensor->get_uid(), tensor->get_pass_by_value().value()); + } + } + } else { + for (auto& [name, tensor] : derived->inputs) { + (void)name; + if (tensor && tensor->get_pass_by_value().has_value()) { + tensor_to_pass_by_value.emplace(tensor->get_uid(), tensor->get_pass_by_value().value()); 
+ } + } + } + + return {error_code_t::OK, ""}; + } + + void + fill_from_context(detail::Context const& context) { + auto derived = static_cast(this); + + if constexpr (std::is_same_v) { + for (auto& tensor : derived->inputs) { + if (tensor) { + tensor->fill_from_context(context); + } + } + } else { + for (auto& [name, tensor] : derived->inputs) { + (void)name; + if (tensor) { + tensor->fill_from_context(context); + } + } + } + for (auto& [name, tensor] : derived->outputs) { + (void)name; + if (tensor) { + tensor->fill_from_context(context); + } + } + // Handle special case of BN where peer_stats is also an input + if constexpr (std::is_same_v || + std::is_same_v) { + for (auto& tensor : derived->peer_stats) { + if (tensor) { + tensor->fill_from_context(context); + } + } + } + + if (compute_data_type == DataType_t::NOT_SET) { + set_compute_data_type(context.get_compute_data_type()); + } + + // Handle shape and stride inferencing for fused scalars. + // Pick number of dimensions from anyone of non-fused-scalar input/output tensors + // In case, all tensors are fused scalars, just keep them 1D. + int64_t number_of_dims = 1; + if constexpr (std::is_same_v) { + for (auto tensor : derived->inputs) { + if (tensor && (tensor->get_pass_by_value().has_value() == false)) { + number_of_dims = tensor->get_dim().size(); + break; + } + } + } else { + for (auto [name, tensor] : derived->inputs) { + (void)name; + if (tensor && (tensor->get_pass_by_value().has_value() == false)) { + number_of_dims = tensor->get_dim().size(); + break; + } + } + } + + // If number of dims is still 1, try to see if user set output dims. 
+ if (number_of_dims == 1) { + for (auto [name, tensor] : derived->outputs) { + (void)name; + if (tensor && (tensor->get_pass_by_value().has_value() == false)) { + number_of_dims = tensor->get_dim().size(); + break; + } + } + } + + if constexpr (std::is_same_v) { + for (auto tensor : derived->inputs) { + if (tensor && tensor->get_pass_by_value().has_value()) { + tensor->set_dim(std::vector(number_of_dims, 1)); + tensor->set_stride(std::vector(number_of_dims, 1)); + } + } + } else { + for (auto [name, tensor] : derived->inputs) { + (void)name; + if (tensor && tensor->get_pass_by_value().has_value()) { + tensor->set_dim(std::vector(number_of_dims, 1)); + tensor->set_stride(std::vector(number_of_dims, 1)); + } + } + } + } + + std::string name; + DataType_t compute_data_type = DataType_t::NOT_SET; + + DerivedT& + set_name(std::string const& value) { + name = value; + return self(); + } + + DerivedT& + set_compute_data_type(DataType_t value) { + compute_data_type = value; + return self(); + } +}; + +class BN_finalize_attributes : public Attributes { + friend class Attributes; + friend class BatchNormFinalizeNode; + friend class Graph; + + public: + enum class input_names { + SUM, + SQ_SUM, + SCALE, + BIAS, + EPSILON, + ACCUM_COUNT, + PREV_RUNNING_MEAN, + PREV_RUNNING_VAR, + MOMENTUM + }; + std::unordered_map> inputs; + enum class output_names { EQ_SCALE, EQ_BIAS, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR }; + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(BN_finalize_attributes, name, compute_data_type, inputs, outputs) + std::unordered_map> outputs; + + BN_finalize_attributes& + set_previous_running_stats(std::shared_ptr& mean, + std::shared_ptr& variance, + std::shared_ptr& momentum) { + inputs[BN_finalize_attributes::input_names::PREV_RUNNING_MEAN] = mean; + inputs[BN_finalize_attributes::input_names::PREV_RUNNING_VAR] = variance; + inputs[BN_finalize_attributes::input_names::MOMENTUM] = momentum; + return *this; + } +}; + +class Genstats_attributes : public 
Attributes { + friend class Attributes; + friend class GenstatsNode; + friend class Graph; + + public: + enum class input_names { X }; + std::unordered_map> inputs; + + enum class output_names { SUM, SQ_SUM }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Genstats_attributes, name, compute_data_type, inputs, outputs) +}; + +class Conv_fprop_attributes : public Attributes { + friend class Attributes; + friend class ConvolutionNode; + friend class Graph; + + std::vector pre_padding; + std::vector post_padding; + std::vector stride; + std::vector dilation; + + ConvolutionMode_t math_mode = ConvolutionMode_t::CROSS_CORRELATION; + + public: + enum class input_names { X, W }; + std::unordered_map> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_fprop_attributes, + name, + compute_data_type, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation, + math_mode) + + ConvolutionMode_t + get_convolution_mode() const { + return math_mode; + } + + std::vector + get_pre_padding() const { + return pre_padding; + } + + std::vector + get_post_padding() const { + return post_padding; + } + + Conv_fprop_attributes& + set_padding(std::vector value) { + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_fprop_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_fprop_attributes& + set_post_padding(std::vector value) { + post_padding = value; + return *this; + } + + Conv_fprop_attributes& + set_convolution_mode(ConvolutionMode_t mode_) { + math_mode = mode_; + return *this; + } + + std::vector + get_stride() const { + return stride; + } + + Conv_fprop_attributes& + set_stride(std::vector value) { + stride = value; + return *this; + } + + std::vector + get_dilation() const { + return dilation; + } + + Conv_fprop_attributes& + set_dilation(std::vector value) { + dilation = value; + return *this; + } +}; + +class 
Batchnorm_backward_attributes : public Attributes { + friend class Attributes; + friend class DBNNode; + friend class Graph; + + public: + enum class input_names { DY, X, SCALE, MEAN, INV_VARIANCE }; + std::unordered_map> inputs; + // Only special case where one of the inputs is a vector. + std::vector> peer_stats; + enum class output_names { DX, DSCALE, DBIAS }; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_backward_attributes, name, compute_data_type, inputs, peer_stats, outputs) + std::unordered_map> outputs; + + Batchnorm_backward_attributes& + set_saved_mean_and_inv_variance(std::shared_ptr mean, + std::shared_ptr inv_variance) { + inputs[Batchnorm_backward_attributes::input_names::MEAN] = mean; + inputs[Batchnorm_backward_attributes::input_names::INV_VARIANCE] = inv_variance; + return *this; + } + + Batchnorm_backward_attributes& + set_peer_stats(std::vector> const& input_peer_stats) { + peer_stats = input_peer_stats; + return *this; + } +}; + +class DBN_weight_attributes : public Attributes { + friend class Attributes; + friend class DBNWeightNode; + friend class Graph; + + public: + enum class input_names { DY, X, SCALE, MEAN, INV_VARIANCE }; + std::unordered_map> inputs; + enum class output_names { DSCALE, DBIAS, EQ_BIAS, EQ_SCALE_DY, EQ_SCALE_X }; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(DBN_weight_attributes, name, compute_data_type, inputs, outputs) + std::unordered_map> outputs; +}; + +class Conv_dgrad_attributes : public Attributes { + friend class Attributes; + friend class DgradNode; + friend class Graph; + + std::vector pre_padding; + std::vector post_padding; + std::vector stride; + std::vector dilation; + + ConvolutionMode_t math_mode = ConvolutionMode_t::CROSS_CORRELATION; + + public: + enum class input_names { DY, W }; + std::unordered_map> inputs; + enum class output_names { DX }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_dgrad_attributes, + name, + compute_data_type, + inputs, + outputs, + pre_padding, + post_padding, + 
stride, + dilation, + math_mode) + + ConvolutionMode_t + get_convolution_mode() const { + return math_mode; + } + + std::vector + get_pre_padding() const { + return pre_padding; + } + + std::vector + get_post_padding() const { + return post_padding; + } + + Conv_dgrad_attributes& + set_padding(std::vector value) { + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_dgrad_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_dgrad_attributes& + set_post_padding(std::vector value) { + post_padding = value; + return *this; + } + + std::vector + get_stride() const { + return stride; + } + + Conv_dgrad_attributes& + set_convolution_mode(ConvolutionMode_t mode_) { + math_mode = mode_; + ; + return *this; + } + + Conv_dgrad_attributes& + set_stride(std::vector value) { + stride = value; + return *this; + } + + std::vector + get_dilation() const { + return dilation; + } + + Conv_dgrad_attributes& + set_dilation(std::vector value) { + dilation = value; + return *this; + } +}; + +class Matmul_fp8_attributes : public Attributes { + friend class Attributes; + friend class MatmulFP8Node; + friend class INode; + + double padding_value = 0.0; + + public: + enum class input_names { Descale_A, Descale_B, A, B, M_override, N_override, K_override, Scale_C }; + std::unordered_map> inputs; + enum class output_names { C, Amax_C }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Matmul_fp8_attributes, name, compute_data_type, inputs, outputs) + + Matmul_fp8_attributes& + set_m_override(std::shared_ptr const& value) { + inputs[input_names::M_override] = value; + return *this; + } + + Matmul_fp8_attributes& + set_n_override(std::shared_ptr const& value) { + inputs[input_names::N_override] = value; + return *this; + } + + Matmul_fp8_attributes& + set_k_override(std::shared_ptr const& value) { + inputs[input_names::K_override] = value; + return *this; + } + + Matmul_fp8_attributes& + set_padding(double 
const padding_val) { + padding_value = padding_val; + return *this; + } + + double + get_padding() const { + return padding_value; + } +}; + +class Matmul_attributes : public Attributes { + friend class Attributes; + friend class MatmulNode; + friend class INode; + + double padding_value = 0.0; + + public: + enum class input_names { A, B, M_override, N_override, K_override }; + std::unordered_map> inputs; + enum class output_names { C }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Matmul_attributes, name, compute_data_type, inputs, outputs, padding_value) + + Matmul_attributes& + clone_fp8_attributes(Matmul_fp8_attributes const& attributes) { + auto m_override = attributes.inputs.find(Matmul_fp8_attributes::input_names::M_override); + if (m_override != attributes.inputs.end()) { + set_m_override(m_override->second); + } + auto n_override = attributes.inputs.find(Matmul_fp8_attributes::input_names::N_override); + if (n_override != attributes.inputs.end()) { + set_n_override(n_override->second); + } + auto k_override = attributes.inputs.find(Matmul_fp8_attributes::input_names::K_override); + if (k_override != attributes.inputs.end()) { + set_k_override(k_override->second); + } + + set_padding(attributes.get_padding()); + + return *this; + } + + Matmul_attributes& + set_m_override(std::shared_ptr const& value) { + inputs[input_names::M_override] = value; + return *this; + } + + Matmul_attributes& + set_n_override(std::shared_ptr const& value) { + inputs[input_names::N_override] = value; + return *this; + } + + Matmul_attributes& + set_k_override(std::shared_ptr const& value) { + inputs[input_names::K_override] = value; + return *this; + } + + Matmul_attributes& + set_padding(double const padding_val) { + padding_value = padding_val; + return *this; + } +}; + +class Pointwise_attributes : public Attributes { + friend class Attributes; + friend class PointwiseNode; + friend class SoftmaxNode; + friend class INode; + + PointwiseMode_t mode = 
PointwiseMode_t::NOT_SET; + + std::optional axis; + + std::optional relu_lower_clip; + std::optional relu_upper_clip; + std::optional relu_lower_clip_slope; + + std::optional swish_beta; + std::optional elu_alpha; + std::optional softplus_beta; + + public: + enum class input_names { IN_0, IN_1, IN_2 }; + std::unordered_map> inputs; + enum class output_names { OUT_0 }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Pointwise_attributes, + name, + compute_data_type, + inputs, + outputs, + mode, + axis, + relu_lower_clip, + relu_upper_clip, + relu_lower_clip_slope, + swish_beta, + elu_alpha, + softplus_beta) + + Pointwise_attributes& + set_mode(PointwiseMode_t const value) { + mode = value; + return *this; + } + + std::optional + get_axis() const { + return axis; + } + + Pointwise_attributes& + set_axis(int64_t const axis) { + this->axis = axis; + return *this; + } + + Pointwise_attributes& + set_relu_lower_clip_slope(float const negative_slope) { + this->relu_lower_clip_slope = negative_slope; + return *this; + } + + Pointwise_attributes& + set_relu_lower_clip(float const value) { + this->relu_lower_clip = value; + return *this; + } + + Pointwise_attributes& + set_relu_upper_clip(float const value) { + this->relu_upper_clip = value; + return *this; + } + + Pointwise_attributes& + set_swish_beta(float const value) { + this->swish_beta = value; + return *this; + } + + Pointwise_attributes& + set_elu_alpha(float const value) { + this->elu_alpha = value; + return *this; + } + + Pointwise_attributes& + set_softplus_beta(float const value) { + this->softplus_beta = value; + return *this; + } +}; + +class Instancenorm_backward_attributes : public Attributes { + friend class Attributes; + friend class DINNode; + friend class Graph; + + public: + enum class input_names { DY, X, SCALE, MEAN, INV_VARIANCE }; + std::unordered_map> inputs; + enum class output_names { DX, DSCALE, DBIAS }; + std::unordered_map> outputs; + 
NLOHMANN_DEFINE_TYPE_INTRUSIVE(Instancenorm_backward_attributes, name, compute_data_type, inputs, outputs) + + Instancenorm_backward_attributes& + set_saved_mean_and_inv_variance(std::shared_ptr mean, + std::shared_ptr inv_variance) { + inputs[Instancenorm_backward_attributes::input_names::MEAN] = mean; + inputs[Instancenorm_backward_attributes::input_names::INV_VARIANCE] = inv_variance; + return *this; + } +}; + +class Layernorm_backward_attributes : public Attributes { + friend class Attributes; + friend class DLNNode; + friend class Graph; + + public: + enum class input_names { DY, X, SCALE, MEAN, INV_VARIANCE, EPSILON }; + std::unordered_map> inputs; + enum class output_names { DX, DSCALE, DBIAS }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Layernorm_backward_attributes, name, compute_data_type, inputs, outputs) + + Layernorm_backward_attributes& + set_saved_mean_and_inv_variance(std::shared_ptr mean, + std::shared_ptr inv_variance) { + inputs[Layernorm_backward_attributes::input_names::MEAN] = mean; + inputs[Layernorm_backward_attributes::input_names::INV_VARIANCE] = inv_variance; + return *this; + } +}; + +class Layernorm_attributes : public Attributes { + friend class Attributes; + friend class LayerNormNode; + friend class Graph; + + NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; + + public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::unordered_map> inputs; + enum class output_names { Y, MEAN, INV_VARIANCE }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Layernorm_attributes, name, compute_data_type, inputs, outputs, forward_phase) + + Layernorm_attributes& + set_forward_phase(NormFwdPhase_t const value) { + forward_phase = value; + return *this; + } + + Layernorm_attributes& + set_epsilon(std::shared_ptr& value) { + inputs[Layernorm_attributes::input_names::EPSILON] = value; + return *this; + } +}; + +class AdaLayernorm_attributes : public Attributes { + friend class Attributes; + friend 
class AdaLayerNormNode; + friend class Graph; + + NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; + + public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::unordered_map> inputs; + enum class output_names { Y, MEAN, INV_VARIANCE }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(AdaLayernorm_attributes, name, compute_data_type, inputs, outputs, forward_phase) + + AdaLayernorm_attributes& + set_forward_phase(NormFwdPhase_t const value) { + forward_phase = value; + return *this; + } + + AdaLayernorm_attributes& + set_epsilon(std::shared_ptr value) { + inputs[AdaLayernorm_attributes::input_names::EPSILON] = std::move(value); + return *this; + } +}; + +class AdaLayernorm_backward_attributes : public Attributes { + friend class Attributes; + friend class DAdaLayerNormNode; + friend class Graph; + + public: + enum class input_names { DY, X, SCALE, MEAN, INV_VARIANCE, EPSILON }; + std::unordered_map> inputs; + enum class output_names { DX, DSCALE, DBIAS }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(AdaLayernorm_backward_attributes, name, compute_data_type, inputs, outputs) + + AdaLayernorm_backward_attributes& + set_saved_mean_and_inv_variance(std::shared_ptr mean, + std::shared_ptr inv_variance) { + inputs[AdaLayernorm_backward_attributes::input_names::MEAN] = mean; + inputs[AdaLayernorm_backward_attributes::input_names::INV_VARIANCE] = inv_variance; + return *this; + } +}; + +class Instancenorm_attributes : public Attributes { + friend class Attributes; + friend class InstanceNormNode; + friend class Graph; + + NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; + + public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::unordered_map> inputs; + enum class output_names { Y, MEAN, INV_VARIANCE }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Instancenorm_attributes, name, compute_data_type, inputs, outputs, forward_phase) + + Instancenorm_attributes& + 
set_forward_phase(NormFwdPhase_t const value) { + forward_phase = value; + return *this; + } + + Instancenorm_attributes& + set_epsilon(std::shared_ptr& value) { + inputs[Instancenorm_attributes::input_names::EPSILON] = value; + return *this; + } +}; + +class Batchnorm_attributes : public Attributes { + friend class Attributes; + friend class BatchNormNode; + friend class Graph; + + public: + enum class input_names { X, SCALE, BIAS, PREV_RUNNING_MEAN, PREV_RUNNING_VAR, EPSILON, MOMENTUM }; + std::unordered_map> inputs; + // Only special case where one of the inputs is a vector. + std::vector> peer_stats; + enum class output_names { Y, MEAN, INV_VARIANCE, NEXT_RUNNING_MEAN, NEXT_RUNNING_VAR }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_attributes, name, compute_data_type, inputs, peer_stats, outputs) + + Batchnorm_attributes& + set_previous_running_stats(std::shared_ptr& mean, + std::shared_ptr& variance, + std::shared_ptr& momentum) { + inputs[input_names::PREV_RUNNING_MEAN] = mean; + inputs[input_names::PREV_RUNNING_VAR] = variance; + inputs[input_names::MOMENTUM] = momentum; + return *this; + } + + Batchnorm_attributes& + set_epsilon(std::shared_ptr& value) { + inputs[input_names::EPSILON] = value; + return *this; + } + + Batchnorm_attributes& + set_peer_stats(std::vector> const& input_peer_stats) { + peer_stats = input_peer_stats; + return *this; + } +}; + +class Batchnorm_inference_attributes : public Attributes { + friend class Attributes; + friend class BatchnormInferenceNode; + friend class Graph; + + public: + enum class input_names { X, MEAN, INV_VARIANCE, SCALE, BIAS }; + std::unordered_map> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Batchnorm_inference_attributes, name, compute_data_type, inputs, outputs) +}; + +class Reduction_attributes : public Attributes { + friend class Attributes; + friend class ReductionNode; + friend class INode; + + std::optional mode; + 
bool is_deterministic = false; + + public: + enum class input_names { X }; + std::unordered_map> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Reduction_attributes, + name, + compute_data_type, + inputs, + outputs, + mode, + is_deterministic) + + std::optional + get_mode() const { + return mode; + } + + Reduction_attributes& + set_mode(ReductionMode_t value) { + mode = value; + return *this; + } + + bool + get_is_deterministic() const { + return is_deterministic; + } + + Reduction_attributes& + set_is_deterministic(bool value) { + is_deterministic = value; + return *this; + } +}; + +class Rng_attributes : public Attributes { + friend class Attributes; + friend class RngNode; + friend class INode; + + RngDistribution_t distribution = RngDistribution_t::NOT_SET; + std::vector dim = {}; + std::vector stride = {}; + std::optional seed; + std::optional bernoulli_probability; + + public: + enum class input_names { Seed, Offset }; + std::unordered_map> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rng_attributes, + name, + inputs, + outputs, + distribution, + dim, + stride, + seed, + bernoulli_probability) + + std::vector + get_dim() const { + return dim; + } + + auto + set_dim(std::vector const& value) -> Rng_attributes& { + dim = value; + return *this; + } + + std::vector + get_stride() const { + return stride; + } + + auto + set_stride(std::vector const& value) -> Rng_attributes& { + stride = value; + return *this; + } + + RngDistribution_t + get_distribution() const { + return distribution; + } + + Rng_attributes& + set_distribution(RngDistribution_t value) { + distribution = value; + return *this; + } + + std::optional + get_seed() const { + return seed; + } + + Rng_attributes& + set_seed(std::optional value) { + seed = value; + return *this; + } + + std::optional + get_bernoulli_probability() const { + return bernoulli_probability; + } + + 
Rng_attributes& + set_bernoulli_probability(std::optional value) { + bernoulli_probability = value; + return *this; + } +}; + +class Resample_attributes : public Attributes { + friend class Attributes; + friend class ResampleNode; + friend class INode; + + std::optional generate_index; + ResampleMode_t resample_mode; + PaddingMode_t padding_mode; + std::vector pre_padding; + std::vector post_padding; + std::vector stride; + std::vector window; + + public: + enum class input_names { X }; + std::unordered_map> inputs; + + enum class output_names { Y, Index }; + std::unordered_map> outputs; + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Resample_attributes, + name, + inputs, + outputs, + generate_index, + resample_mode, + padding_mode, + pre_padding, + post_padding, + stride, + window) + + auto + set_resampling_mode(ResampleMode_t const& value) -> Resample_attributes& { + resample_mode = value; + return *this; + } + + auto + set_padding_mode(PaddingMode_t const& value) -> Resample_attributes& { + padding_mode = value; + return *this; + } + + auto + set_window(std::vector const& value) -> Resample_attributes& { + window.resize(value.size()); + for (auto i = 0u; i < value.size(); i++) { + window[i].numerator = value[i]; + window[i].denominator = 1; + } + return *this; + } + + auto + set_window(std::vector const& value) -> Resample_attributes& { + window = value; + return *this; + } + + auto + set_stride(std::vector const& value) -> Resample_attributes& { + stride.resize(value.size()); + for (auto i = 0u; i < value.size(); i++) { + stride[i].numerator = value[i]; + stride[i].denominator = 1; + } + return *this; + } + + auto + set_stride(std::vector const& value) -> Resample_attributes& { + stride = value; + return *this; + } + + auto + set_pre_padding(std::vector const& value) -> Resample_attributes& { + pre_padding.resize(value.size()); + for (auto i = 0u; i < value.size(); i++) { + pre_padding[i].numerator = value[i]; + pre_padding[i].denominator = 1; + } + return *this; + } + + 
auto + set_pre_padding(std::vector const& value) -> Resample_attributes& { + pre_padding = value; + return *this; + } + + auto + set_post_padding(std::vector const& value) -> Resample_attributes& { + post_padding.resize(value.size()); + for (auto i = 0u; i < value.size(); i++) { + post_padding[i].numerator = value[i]; + post_padding[i].denominator = 1; + } + return *this; + } + + auto + set_post_padding(std::vector const& value) -> Resample_attributes& { + post_padding = value; + return *this; + } + + auto + set_generate_index(bool const value) -> Resample_attributes& { + generate_index = value; + return *this; + } + + [[deprecated]] auto + set_is_inference(bool const value) -> Resample_attributes& { + return set_generate_index(!value); + } +}; + +class Reshape_attributes : public Attributes { + friend class Attributes; + friend class ReshapeNode; + friend class INode; + + std::vector dim = {}; + std::vector stride = {}; + + public: + enum class input_names { X }; + std::unordered_map> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Reshape_attributes, name, compute_data_type, inputs, outputs, dim, stride) + + std::vector + get_dim() const { + return dim; + } + + auto + set_dim(std::vector const& value) -> Reshape_attributes& { + dim = value; + return *this; + } + + std::vector + get_stride() const { + return stride; + } + + auto + set_stride(std::vector const& value) -> Reshape_attributes& { + stride = value; + return *this; + } +}; + +class Rmsnorm_attributes : public Attributes { + friend class Attributes; + friend class RMSNormNode; + friend class Graph; + + NormFwdPhase_t forward_phase = NormFwdPhase_t::NOT_SET; + + public: + enum class input_names { X, SCALE, BIAS, EPSILON }; + std::unordered_map> inputs; + enum class output_names { Y, INV_VARIANCE }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rmsnorm_attributes, name, compute_data_type, inputs, outputs, forward_phase) + + 
Rmsnorm_attributes& + set_forward_phase(NormFwdPhase_t const value) { + forward_phase = value; + return *this; + } + + Rmsnorm_attributes& + set_bias(std::shared_ptr& value) { + inputs[Rmsnorm_attributes::input_names::BIAS] = value; + return *this; + } + + Rmsnorm_attributes& + set_epsilon(std::shared_ptr& value) { + inputs[Rmsnorm_attributes::input_names::EPSILON] = value; + return *this; + } +}; + +class Rmsnorm_backward_attributes : public Attributes { + friend class Attributes; + friend class DRMSNormNode; + friend class Graph; + + std::optional use_dbias; + + public: + enum class input_names { DY, X, SCALE, INV_VARIANCE }; + std::unordered_map> inputs; + enum class output_names { DX, DSCALE, DBIAS }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Rmsnorm_backward_attributes, name, compute_data_type, inputs, outputs) + + Rmsnorm_backward_attributes& + has_dbias(bool value) { + use_dbias = value; + return *this; + } +}; + +// class Scaled_dot_product_attention_attributes : public Operation { +// public: +// struct Inputs { +// std::shared_ptr Q; +// std::shared_ptr K; +// std::shared_ptr Attn_scale; +// std::shared_ptr Bias; // Optional bias after bmm1 +// std::shared_ptr V; +// std::shared_ptr SEQ_LEN_Q; +// std::shared_ptr SEQ_LEN_KV; +// std::shared_ptr Mask; +// std::shared_ptr Dropout_mask; +// std::shared_ptr Dropout_scale; +// } inputs; + +// struct Outputs { +// std::shared_ptr O; +// std::shared_ptr +// S; // softmax output dumped when is_inference false. Users first need to check whether its nullptr. 
+// } outputs; + +// std::optional is_inference; +// bool padding_mask = false; +// bool causal_mask = false; +// std::optional dropout_probability; +// int64_t seed; +// float dropout_scale = 1.f; + +// public: +// Scaled_dot_product_attention_attributes() : Operation(Tag::Scaled_dot_product_attention), is_inference(false) {} + +// Scaled_dot_product_attention_attributes& +// set_is_inference(bool const value) { +// is_inference = value; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_seq_len_q(std::shared_ptr value) { +// inputs.SEQ_LEN_Q = value; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_seq_len_kv(std::shared_ptr value) { +// inputs.SEQ_LEN_KV = value; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_padding_mask(bool const value) { +// padding_mask = value; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_causal_mask(bool const value) { +// causal_mask = value; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_attn_scale(std::shared_ptr value) { +// inputs.Attn_scale = value; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_bias(std::shared_ptr bias) { +// inputs.Bias = bias; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_dropout(float const probability, int64_t const seed_) { +// dropout_probability = probability; +// seed = seed_; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_dropout(std::shared_ptr mask, std::shared_ptr scale) { +// inputs.Dropout_mask = mask; +// inputs.Dropout_scale = scale; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_compute_data_type(DataType_t const value) { +// compute_data_type = value; +// return *this; +// } + +// Scaled_dot_product_attention_attributes& +// set_name(std::string const& value) { +// name = value; +// return *this; +// } + +// 
Scaled_dot_product_attention_attributes& +// fill_from_context(detail::Context const& context) { +// // Fill node's tensors +// inputs.Q->fill_from_context(context); +// inputs.K->fill_from_context(context); +// inputs.V->fill_from_context(context); +// inputs.SEQ_LEN_Q->fill_from_context(context); +// inputs.SEQ_LEN_KV->fill_from_context(context); +// outputs.O->fill_from_context(context); + +// // Fill this node +// if (get_compute_data_type() == DataType_t::NOT_SET) { +// set_compute_data_type(context.get_compute_data_type()); +// } +// return *this; +// } +// }; +template +class SDPANodeBase; +class CompositeSDPANode; +class UnifiedSDPANode; + +class SDPA_attributes : public Attributes { + friend class Attributes; + friend class SDPANodeBase; + friend class CompositeSDPANode; + friend class SDPANodeBase; + friend class UnifiedSDPANode; + friend class Graph; + + using Tensor_t = std::shared_ptr; + using Graph_t = std::shared_ptr; + + using AttentionScoreModifier_t = + std::function, std::shared_ptr)>; + + std::optional generate_stats; + bool alibi_mask = false; + bool padding_mask = false; + std::optional left_bound; + std::optional right_bound; + DiagonalAlignment_t diagonal_alignment = DiagonalAlignment_t::TOP_LEFT; + std::optional dropout_probability; + std::optional attn_scale_value; + std::optional max_seq_len_kv; + AttentionScoreModifier_t attention_score_modifier = nullptr; + DataType_t mma_core_mode = DataType_t::NOT_SET; + + // Deprecated fields for backward compatibility with SDPA_fp8_attributes + bool causal_mask = false; + bool causal_mask_bottom_right = false; + + AttentionImplementation_t implementation = AttentionImplementation_t::AUTO; + + bool + has_causal_like_masking() const { + return right_bound.has_value(); + } + + bool + has_causal_mask_bottom_right() const { + return right_bound.has_value() && diagonal_alignment == DiagonalAlignment_t::BOTTOM_RIGHT; + } + + public: + enum class input_names { + Q, + K, + V, + Attn_scale, + Bias, + 
SEQ_LEN_Q, + SEQ_LEN_KV, + Seed, + Offset, + Dropout_mask, + Dropout_scale, + Page_table_K, + Page_table_V, + Block_mask, + // FP8-specific scaling inputs + Descale_Q, + Descale_K, + Descale_V, + Descale_S, + Scale_S, + Scale_O, + SINK_TOKEN, + }; + std::unordered_map> inputs; + enum class output_names { O, Stats, Max, Sum_exp, RNG_DUMP, Amax_S, Amax_O }; + std::unordered_map> outputs; + // Convenience struct for named access to SDPA outputs + struct SDPA_outputs { + std::shared_ptr O; ///< Main attention output tensor + std::shared_ptr Stats; ///< Statistics/softmax output (when generate_stats=true) + std::shared_ptr Max; ///< Max output tensor + std::shared_ptr Sum_exp; ///< Sum_exp output tensor + std::shared_ptr RNG_DUMP; ///< Random number generator dump for dropout + ///< check why we don't return RNG_DUMP this way + std::shared_ptr Amax_S; ///< FP8 absolute maximum for attention scores + std::shared_ptr Amax_O; ///< FP8 absolute maximum for output tensor + }; + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(SDPA_attributes, + name, + inputs, + outputs, + generate_stats, + alibi_mask, + padding_mask, + dropout_probability, + attn_scale_value, + max_seq_len_kv, + mma_core_mode, + left_bound, + right_bound, + diagonal_alignment, + causal_mask, + causal_mask_bottom_right, + implementation) + + SDPA_attributes& + set_generate_stats(bool const value) { + generate_stats = value; + return *this; + } + + SDPA_attributes& + set_logit_max(std::shared_ptr value) { + outputs[SDPA_attributes::output_names::Max] = std::move(value); + return *this; + } + + SDPA_attributes& + set_score_sum_exp(std::shared_ptr value) { + outputs[SDPA_attributes::output_names::Sum_exp] = std::move(value); + return *this; + } + + [[deprecated]] SDPA_attributes& + set_is_inference(bool const value) { + return set_generate_stats(!value); + } + + SDPA_attributes& + set_attn_scale(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::Attn_scale] = std::move(value); + return *this; + } + + 
SDPA_attributes& + set_attn_scale(float const value) { + attn_scale_value = value; + return *this; + } + + SDPA_attributes& + set_bias(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::Bias] = std::move(value); + return *this; + } + + SDPA_attributes& + set_block_mask(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::Block_mask] = std::move(value); + return *this; + } + + SDPA_attributes& + set_alibi_mask(bool const value) { + alibi_mask = value; + return *this; + } + + SDPA_attributes& + set_padding_mask(bool const value) { + padding_mask = value; + return *this; + } + + // Internal function - do not use directly in application code + SDPA_attributes& + _set_mma_core_mode(DataType_t const value) { + mma_core_mode = value; + return *this; + } + + SDPA_attributes& + set_seq_len_q(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::SEQ_LEN_Q] = std::move(value); + return *this; + } + + SDPA_attributes& + set_seq_len_kv(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::SEQ_LEN_KV] = std::move(value); + return *this; + } + + SDPA_attributes& + set_diagonal_alignment(DiagonalAlignment_t const alignment) { + diagonal_alignment = alignment; + return *this; + } + + // Sets the diagonal position to top left and + // calls set_diagonal_band_right_bound(0) if no right_bound was specified + // TODO: Deprecate + SDPA_attributes& + set_causal_mask(bool const value) { + if (value) { + set_diagonal_alignment(DiagonalAlignment_t::TOP_LEFT); + if (!right_bound.has_value()) { + set_diagonal_band_right_bound(0); + } + } + causal_mask = value; + return *this; + } + + // Sets the diagonal position to the bottom right (on a per-sequence basis) + // and calls set_diagonal_band_right_bound(0) if no right_bound was specified + // TODO: Deprecate + SDPA_attributes& + set_causal_mask_bottom_right(bool const value) { + if (value) { + set_diagonal_alignment(DiagonalAlignment_t::BOTTOM_RIGHT); + if (!right_bound.has_value()) { + 
set_diagonal_band_right_bound(0); + } + } + causal_mask_bottom_right = value; + return *this; + } + + SDPA_attributes& + set_score_mod(AttentionScoreModifier_t fn) { + attention_score_modifier = std::move(fn); + return *this; + } + + // calls set_diagonal_band_left_bound(value) + // TODO: Deprecate + SDPA_attributes& + set_sliding_window_length(int const value) { + return set_diagonal_band_left_bound(value); + } + + SDPA_attributes& + set_diagonal_band_left_bound(int const value) { + left_bound = value; + return *this; + } + + SDPA_attributes& + set_diagonal_band_right_bound(int const value) { + right_bound = value; + return *this; + } + + SDPA_attributes& + set_dropout(float const probability, + std::shared_ptr seed, + std::shared_ptr offset) { + dropout_probability = probability; + inputs[SDPA_attributes::input_names::Seed] = std::move(seed); + inputs[SDPA_attributes::input_names::Offset] = std::move(offset); + return *this; + } + + SDPA_attributes& + set_dropout(std::shared_ptr mask, std::shared_ptr scale) { + inputs[SDPA_attributes::input_names::Dropout_mask] = std::move(mask); + inputs[SDPA_attributes::input_names::Dropout_scale] = std::move(scale); + return *this; + } + + // For debugging purposes only. 
+ SDPA_attributes& + set_rng_dump(std::shared_ptr value) { + outputs[SDPA_attributes::output_names::RNG_DUMP] = std::move(value); + return *this; + } + + SDPA_attributes& + set_paged_attention_k_table(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::Page_table_K] = std::move(value); + return *this; + } + + SDPA_attributes& + set_paged_attention_v_table(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::Page_table_V] = std::move(value); + return *this; + } + + SDPA_attributes& + set_paged_attention_max_seq_len_kv(int const value) { + max_seq_len_kv = value; + return *this; + } + + SDPA_attributes& + set_sink_token(std::shared_ptr value) { + inputs[SDPA_attributes::input_names::SINK_TOKEN] = std::move(value); + return *this; + } + + SDPA_attributes& + set_implementation(AttentionImplementation_t value) { + implementation = value; + return *this; + } + + // Implementation is in sdpa_support_surface.h + error_t + validate_sdpa_support_surface(const detail::Context& context, int64_t s_kv, bool is_paged_k, bool is_paged_v) const; + + // Internal function - do not use directly in application code + void + _auto_select_implementation(const detail::Context& context) { + if (verify_sdpa_support_surface_for_implementation(context, AttentionImplementation_t::UNIFIED).is_good()) { + implementation = AttentionImplementation_t::UNIFIED; + CUDNN_FE_LOG_LABEL_ENDL("INFO: Auto-selected SDPA implementation UNIFIED"); + } else if (verify_sdpa_support_surface_for_implementation(context, AttentionImplementation_t::COMPOSITE) + .is_good()) { + implementation = AttentionImplementation_t::COMPOSITE; + CUDNN_FE_LOG_LABEL_ENDL("INFO: Auto-selected SDPA implementation COMPOSITE"); + } else { + // Leave `implementation` with its previous value (usually AUTO). + CUDNN_FE_LOG_LABEL_ENDL("ERROR: No suitable SDPA implementation for given SDPA_attributes"); + } + } + + private: + // Check whether implementation `impl` supports the requested features. 
`impl` must not be AUTO. + // (The `implementation` member variable is ignored.) + error_t + verify_sdpa_support_surface_for_implementation(const detail::Context& context, + AttentionImplementation_t impl) const; +}; + +// Type alias for backward compatibility - SDPA_fp8_attributes is now an alias to SDPA_attributes +// All FP8 functionality is unified in SDPA_attributes with the mma_core_mode field +using SDPA_fp8_attributes = SDPA_attributes; + +class SDPA_backward_attributes : public Attributes { + friend class Attributes; + friend class CompositeSDPABackwardNode; + friend class Graph; + using Tensor_t = std::shared_ptr; + using Graph_t = std::shared_ptr; + + using AttentionScoreModifier_t = + std::function, std::shared_ptr)>; + + bool alibi_mask = false; + bool padding_mask = false; + std::optional left_bound; + std::optional right_bound; + DiagonalAlignment_t diagonal_alignment = DiagonalAlignment_t::TOP_LEFT; + + std::optional dropout_probability; + std::optional attn_scale_value; + + std::optional max_total_seq_len_q; + std::optional max_total_seq_len_kv; + + bool is_deterministic_algorithm = false; + AttentionScoreModifier_t attention_score_modifier = nullptr; + AttentionScoreModifier_t attention_score_modifier_bprop = nullptr; + + bool + has_causal_like_masking() const { + return right_bound.has_value(); + } + + bool + has_causal_mask_bottom_right() const { + return right_bound.has_value() && diagonal_alignment == DiagonalAlignment_t::BOTTOM_RIGHT; + } + + public: + enum class input_names { + Q, + K, + V, + O, + dO, + Stats, + Attn_scale, + Bias, + SEQ_LEN_Q, + SEQ_LEN_KV, + Seed, + Offset, + Dropout_mask, + Dropout_scale, + Dropout_scale_inv, + SINK_TOKEN, + }; + std::unordered_map> inputs; + enum class output_names { dQ, dK, dV, dBias, RNG_DUMP, DSINK_TOKEN }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(SDPA_backward_attributes, + name, + inputs, + outputs, + alibi_mask, + padding_mask, + dropout_probability, + attn_scale_value, + 
left_bound, + right_bound, + diagonal_alignment, + max_total_seq_len_q, + max_total_seq_len_kv, + is_deterministic_algorithm) + + SDPA_backward_attributes& + set_attn_scale(std::shared_ptr value) { + inputs[SDPA_backward_attributes::input_names::Attn_scale] = value; + return *this; + } + + SDPA_backward_attributes& + set_attn_scale(float const value) { + attn_scale_value = value; + return *this; + } + + SDPA_backward_attributes& + set_bias(std::shared_ptr value) { + inputs[SDPA_backward_attributes::input_names::Bias] = value; + return *this; + } + + SDPA_backward_attributes& + set_dbias(std::shared_ptr value) { + outputs[SDPA_backward_attributes::output_names::dBias] = value; + return *this; + } + + SDPA_backward_attributes& + set_alibi_mask(bool const value) { + alibi_mask = value; + return *this; + } + + SDPA_backward_attributes& + set_padding_mask(bool const value) { + padding_mask = value; + return *this; + } + + SDPA_backward_attributes& + set_score_mod(AttentionScoreModifier_t fn) { + attention_score_modifier = std::move(fn); + return *this; + } + + SDPA_backward_attributes& + set_score_mod_bprop(AttentionScoreModifier_t fn) { + attention_score_modifier_bprop = std::move(fn); + return *this; + } + + SDPA_backward_attributes& + set_seq_len_q(std::shared_ptr value) { + inputs[SDPA_backward_attributes::input_names::SEQ_LEN_Q] = value; + return *this; + } + + SDPA_backward_attributes& + set_seq_len_kv(std::shared_ptr value) { + inputs[SDPA_backward_attributes::input_names::SEQ_LEN_KV] = value; + return *this; + } + + SDPA_backward_attributes& + set_max_total_seq_len_q(int64_t const value) { + max_total_seq_len_q = value; + return *this; + } + + SDPA_backward_attributes& + set_max_total_seq_len_kv(int64_t const value) { + max_total_seq_len_kv = value; + return *this; + } + + SDPA_backward_attributes& + set_diagonal_alignment(DiagonalAlignment_t const alignment) { + diagonal_alignment = alignment; + return *this; + } + + // Sets the diagonal position to top left 
and + // calls set_diagonal_band_right_bound(0) if no right_bound was specified + // TODO: Deprecate + SDPA_backward_attributes& + set_causal_mask(bool const value) { + if (value) { + set_diagonal_alignment(DiagonalAlignment_t::TOP_LEFT); + if (!right_bound.has_value()) { + set_diagonal_band_right_bound(0); + } + } + return *this; + } + + // Sets the diagonal position to the bottom right (on a per-sequence basis) + // and calls set_diagonal_band_right_bound(0) if no right_bound was specified + // TODO: Deprecate + SDPA_backward_attributes& + set_causal_mask_bottom_right(bool const value) { + if (value) { + set_diagonal_alignment(DiagonalAlignment_t::BOTTOM_RIGHT); + if (!right_bound.has_value()) { + set_diagonal_band_right_bound(0); + } + } + return *this; + } + + // calls set_diagonal_band_left_bound(value) + // TODO: Deprecate + SDPA_backward_attributes& + set_sliding_window_length(int const value) { + return set_diagonal_band_left_bound(value); + } + + SDPA_backward_attributes& + set_diagonal_band_left_bound(int const value) { + left_bound = value; + return *this; + } + + SDPA_backward_attributes& + set_diagonal_band_right_bound(int const value) { + right_bound = value; + return *this; + } + + SDPA_backward_attributes& + set_dropout(float const probability, + std::shared_ptr seed, + std::shared_ptr offset) { + dropout_probability = probability; + inputs[SDPA_backward_attributes::input_names::Seed] = seed; + inputs[SDPA_backward_attributes::input_names::Offset] = offset; + return *this; + } + + SDPA_backward_attributes& + set_dropout(std::shared_ptr mask, + std::shared_ptr scale, + std::shared_ptr scale_inv) { + inputs[SDPA_backward_attributes::input_names::Dropout_mask] = mask; + inputs[SDPA_backward_attributes::input_names::Dropout_scale] = scale; + inputs[SDPA_backward_attributes::input_names::Dropout_scale_inv] = scale_inv; + return *this; + } + + // For debugging purposes only. 
+ SDPA_backward_attributes& + set_rng_dump(std::shared_ptr value) { + outputs[SDPA_backward_attributes::output_names::RNG_DUMP] = value; + return *this; + } + + SDPA_backward_attributes& + set_deterministic_algorithm(bool const value) { + is_deterministic_algorithm = value; + return *this; + } + + SDPA_backward_attributes& + set_sink_token(std::shared_ptr value) { + inputs[SDPA_backward_attributes::input_names::SINK_TOKEN] = value; + return *this; + } + + SDPA_backward_attributes& + set_dsink_token(std::shared_ptr value) { + outputs[SDPA_backward_attributes::output_names::DSINK_TOKEN] = value; + return *this; + } +}; + +class SDPA_fp8_backward_attributes : public Attributes { + friend class Attributes; + friend class SDPAFP8BackwardNode; + friend class Graph; + + bool padding_mask = false; + bool causal_mask = false; + bool causal_mask_bottom_right = false; + bool is_deterministic_algorithm = false; + + std::optional dropout_probability; + std::optional attn_scale_value; + + public: + enum class input_names { + Q, + K, + V, + O, + dO, + Stats, + Attn_scale, + Bias, + SEQ_LEN_Q, + SEQ_LEN_KV, + Seed, + Offset, + Dropout_mask, + Dropout_scale, + Dropout_scale_inv, + + Descale_Q, + Descale_K, + Descale_V, + Descale_O, + Descale_dO, + Descale_S, + Descale_dP, + Scale_dQ, + Scale_dK, + Scale_dV, + Scale_S, + Scale_dP, + }; + std::unordered_map> inputs; + + enum class output_names { dQ, dK, dV, Amax_dQ, Amax_dK, Amax_dV, Amax_dP }; + std::unordered_map> outputs; + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(SDPA_fp8_backward_attributes, + name, + compute_data_type, + inputs, + outputs, + padding_mask, + causal_mask, + dropout_probability, + causal_mask_bottom_right, + attn_scale_value, + is_deterministic_algorithm) + + SDPA_fp8_backward_attributes& + set_attn_scale(std::shared_ptr value) { + inputs[SDPA_fp8_backward_attributes::input_names::Attn_scale] = value; + return *this; + } + + SDPA_fp8_backward_attributes& + set_attn_scale(float const value) { + attn_scale_value = value; + 
return *this; + } + + SDPA_fp8_backward_attributes& + set_bias(std::shared_ptr value) { + inputs[SDPA_fp8_backward_attributes::input_names::Bias] = value; + return *this; + } + + SDPA_fp8_backward_attributes& + set_padding_mask(bool const value) { + padding_mask = value; + return *this; + } + + SDPA_fp8_backward_attributes& + set_seq_len_q(std::shared_ptr value) { + inputs[SDPA_fp8_backward_attributes::input_names::SEQ_LEN_Q] = value; + return *this; + } + + SDPA_fp8_backward_attributes& + set_seq_len_kv(std::shared_ptr value) { + inputs[SDPA_fp8_backward_attributes::input_names::SEQ_LEN_KV] = value; + return *this; + } + + SDPA_fp8_backward_attributes& + set_causal_mask(bool const value) { + causal_mask = value; + return *this; + } + + SDPA_fp8_backward_attributes& + set_causal_mask_bottom_right(bool const value) { + causal_mask_bottom_right = value; + return *this; + } + + SDPA_fp8_backward_attributes& + set_dropout(float const probability, + std::shared_ptr seed, + std::shared_ptr offset) { + dropout_probability = probability; + inputs[SDPA_fp8_backward_attributes::input_names::Seed] = seed; + inputs[SDPA_fp8_backward_attributes::input_names::Offset] = offset; + return *this; + } + + SDPA_fp8_backward_attributes& + set_dropout(std::shared_ptr mask, + std::shared_ptr scale, + std::shared_ptr scale_inv) { + inputs[SDPA_fp8_backward_attributes::input_names::Dropout_mask] = mask; + inputs[SDPA_fp8_backward_attributes::input_names::Dropout_scale] = scale; + inputs[SDPA_fp8_backward_attributes::input_names::Dropout_scale_inv] = scale_inv; + return *this; + } + + SDPA_fp8_backward_attributes& + set_deterministic_algorithm(bool const value) { + is_deterministic_algorithm = value; + return *this; + } +}; + +using Scaled_dot_product_flash_attention_attributes [[deprecated]] = SDPA_attributes; +using Scaled_dot_product_flash_attention_backward_attributes [[deprecated]] = SDPA_backward_attributes; + +class Softmax_attributes : public Attributes { + friend class Attributes; 
+ friend class SoftmaxNode; + friend class INode; + + public: + enum class input_names { P, SINK }; + std::unordered_map> inputs; + enum class output_names { S, Stats, Max, Sum_exp }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Softmax_attributes, name, compute_data_type, inputs, outputs) + + Softmax_attributes& + set_sink(std::shared_ptr value) { + inputs[Softmax_attributes::input_names::SINK] = value; + return *this; + } +}; + +class Conv_wgrad_attributes : public Attributes { + friend class Attributes; + friend class WgradNode; + friend class Graph; + + std::vector pre_padding; + std::vector post_padding; + std::vector stride; + std::vector dilation; + ConvolutionMode_t math_mode = ConvolutionMode_t::CROSS_CORRELATION; + + public: + enum class input_names { DY, X }; + std::unordered_map> inputs; + + enum class output_names { DW }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Conv_wgrad_attributes, + name, + compute_data_type, + inputs, + outputs, + pre_padding, + post_padding, + stride, + dilation, + math_mode) + + ConvolutionMode_t + get_convolution_mode() const { + return math_mode; + } + + std::vector + get_pre_padding() const { + return pre_padding; + } + + std::vector + get_post_padding() const { + return post_padding; + } + + Conv_wgrad_attributes& + set_convolution_mode(ConvolutionMode_t mode_) { + math_mode = mode_; + ; + return *this; + } + + Conv_wgrad_attributes& + set_padding(std::vector value) { + pre_padding = value; + post_padding = value; + return *this; + } + + Conv_wgrad_attributes& + set_pre_padding(std::vector value) { + pre_padding = value; + return *this; + } + + Conv_wgrad_attributes& + set_post_padding(std::vector value) { + post_padding = value; + return *this; + } + + std::vector + get_stride() const { + return stride; + } + + Conv_wgrad_attributes& + set_stride(std::vector value) { + stride = value; + return *this; + } + + std::vector + get_dilation() const { + return dilation; + } + + 
Conv_wgrad_attributes& + set_dilation(std::vector value) { + dilation = value; + return *this; + } +}; + +class Slice_attributes : public Attributes { + friend class Attributes; + friend class SliceNode; + friend class INode; + + std::vector> slices; + + public: + enum class input_names { X }; + std::unordered_map> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Slice_attributes, name, compute_data_type, inputs, outputs, slices) + + Slice_attributes& + set_slices(std::vector> const value) { + slices = value; + return *this; + } + + int64_t + get_offset() const { + auto& input = inputs.at(input_names::X); + auto const input_stride = input->get_stride(); + + int64_t offset = 0; + + // Get number of elements to skip + for (size_t i = 0; i < slices.size(); ++i) { + offset += slices[i].first * input_stride[i]; + } + + // multiply by element size to get offset in bytes + offset *= detail::get_data_type_size(input->get_data_type()); + return offset; + } +}; + +class PagedCacheLoad_attributes : public Attributes { + friend class Attributes; + friend class PagedCacheLoadNode; + friend class INode; + + public: + enum class input_names { container, seqLen, pageTable }; + std::unordered_map> inputs; + enum class output_names { yOut }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(PagedCacheLoad_attributes, name, compute_data_type, inputs, outputs) +}; + +class Block_scale_quantize_attributes : public Attributes { + friend class Attributes; + friend class BlockScaleQuantizeNode; + friend class Graph; + + std::optional block_size; + std::optional axis; + bool transpose = false; + + public: + enum class input_names { X }; + std::unordered_map> inputs; + enum class output_names { Y, scale }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Block_scale_quantize_attributes, + name, + compute_data_type, + inputs, + outputs, + block_size, + axis) + + Block_scale_quantize_attributes& + 
set_block_size(int32_t const value) { + block_size = value; + return *this; + } + + Block_scale_quantize_attributes& + set_axis(int64_t const value) { + axis = value; + return *this; + } + + Block_scale_quantize_attributes& + set_transpose(bool const value) { + transpose = value; + return *this; + } +}; + +class Block_scale_dequantize_attributes : public Attributes { + friend class Attributes; + friend class BlockScaleDequantizeNode; + friend class Graph; + + std::vector block_size; + bool is_negative_scale; + + public: + enum class input_names { X, scale }; + std::unordered_map> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Block_scale_dequantize_attributes, + name, + compute_data_type, + inputs, + outputs, + block_size, + is_negative_scale) + + Block_scale_dequantize_attributes& + set_block_size(int32_t const value, int32_t idx = 0) { + if (idx < 0) { + return *this; + } + if (static_cast(block_size.size()) < idx + 1) { + block_size.resize(idx + 1, 1); + } + block_size[idx] = value; + return *this; + } + + Block_scale_dequantize_attributes& + set_block_size(const int32_t* values, int32_t len = 1) { + if (len < 1) { + return *this; + } + if (static_cast(block_size.size()) < len) { + block_size.resize(len); + } + std::copy(values, values + len, block_size.begin()); + return *this; + } + + Block_scale_dequantize_attributes& + set_block_size(const std::vector& values) { + block_size = values; + return *this; + } + + bool + get_is_negative_scale() const { + return is_negative_scale; + } + + Block_scale_dequantize_attributes& + set_is_negative_scale(bool value) { + is_negative_scale = value; + return *this; + } +}; + +#if 0 +class Concatenate_string { + friend class Attributes; + friend class ConcatenateNode; + friend class Graph; +public: +std::string str; +NLOHMANN_DEFINE_TYPE_INTRUSIVE(Concatenate_string, str) +}; +#endif + +class Concatenate_attributes : public Attributes { + friend class Attributes; + 
friend class ConcatenateNode; + friend class Graph; + + std::optional axis; + std::optional in_place_index; + + public: + std::vector> inputs; + enum class output_names { Y }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Concatenate_attributes, name, inputs, outputs, axis, in_place_index) + + Concatenate_attributes& + set_axis(int64_t const value) { + axis = value; + return *this; + } + + Concatenate_attributes& + set_in_place_index(int64_t const value) { + in_place_index = value; + return *this; + } +}; + +class Moe_grouped_matmul_attributes : public Attributes { + friend class Attributes; + friend class MoeGroupedMatmulNode; + friend class Graph; + + MoeGroupedMatmulMode_t mode = MoeGroupedMatmulMode_t::NONE; + + int32_t top_k = 0; + + public: + enum class input_names { Token, Weight, FirstTokenOffset, TokenIndex, TokenKs }; + std::unordered_map> inputs; + enum class output_names { Output }; + std::unordered_map> outputs; + NLOHMANN_DEFINE_TYPE_INTRUSIVE(Moe_grouped_matmul_attributes, name, inputs, outputs, mode, top_k) + + Moe_grouped_matmul_attributes& + set_mode(MoeGroupedMatmulMode_t mode) { + this->mode = mode; + return *this; + } + + Moe_grouped_matmul_attributes& + set_top_k(int32_t top_k) { + this->top_k = top_k; + return *this; + } +}; + +} // namespace graph + +} // namespace cudnn_frontend diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/knobs.h b/third_party/cudnn-frontend/include/cudnn_frontend/knobs.h new file mode 100644 index 00000000..ee4ee4c4 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/knobs.h @@ -0,0 +1,228 @@ +#pragma once + +namespace cudnn_frontend { + +enum class KnobType_t { + NOT_SET, + + SWIZZLE, + TILE_SIZE, + EDGE, + MULTIPLY, + SPLIT_K_BUF, + TILEK, + STAGES, + REDUCTION_MODE, + SPLIT_K_SLC, + IDX_MODE, + SPECFILT, + KERNEL_CFG, + WORKSPACE, + TILE_CGA_M, + TILE_CGA_N, + BLOCK_SIZE, + OCCUPANCY, + ARRAY_SIZE_PER_THREAD, + SPLIT_COLS, + TILE_ROWS, + TILE_COLS, + LOAD_SIZE, + 
CTA_COUNT, + STREAM_K, + SPLIT_P_SLC, + TILE_M, + TILE_N, + WARP_SPEC_CFG, +}; + +class Knob { + public: + KnobType_t type = KnobType_t::NOT_SET; + int64_t maxValue = 0; + int64_t minValue = 0; + int64_t stride = 0; + + Knob(KnobType_t type, int64_t max, int64_t min, int64_t str) + : type(type), maxValue(max), minValue(min), stride(str) {} +}; + +static inline cudnnStatus_t +convert_to_backend_knob_type(KnobType_t const knob_type, cudnnBackendKnobType_t& cudnn_knob_type) { + switch (knob_type) { + case KnobType_t::SWIZZLE: + cudnn_knob_type = CUDNN_KNOB_TYPE_SWIZZLE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::TILE_SIZE: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILE_SIZE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::EDGE: + cudnn_knob_type = CUDNN_KNOB_TYPE_EDGE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::MULTIPLY: + cudnn_knob_type = CUDNN_KNOB_TYPE_MULTIPLY; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::SPLIT_K_BUF: + cudnn_knob_type = CUDNN_KNOB_TYPE_SPLIT_K_BUF; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::TILEK: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILEK; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::STAGES: + cudnn_knob_type = CUDNN_KNOB_TYPE_STAGES; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::REDUCTION_MODE: + cudnn_knob_type = CUDNN_KNOB_TYPE_REDUCTION_MODE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::SPLIT_K_SLC: + cudnn_knob_type = CUDNN_KNOB_TYPE_SPLIT_K_SLC; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::IDX_MODE: + cudnn_knob_type = CUDNN_KNOB_TYPE_IDX_MODE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::SPECFILT: + cudnn_knob_type = CUDNN_KNOB_TYPE_SPECFILT; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::KERNEL_CFG: + cudnn_knob_type = CUDNN_KNOB_TYPE_KERNEL_CFG; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::WORKSPACE: + 
cudnn_knob_type = CUDNN_KNOB_TYPE_WORKSPACE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; +#if (CUDNN_VERSION >= 8600) + case KnobType_t::TILE_CGA_M: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILE_CGA_M; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::TILE_CGA_N: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILE_CGA_N; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; +#endif +#if (CUDNN_VERSION >= 8800) + case KnobType_t::BLOCK_SIZE: + cudnn_knob_type = CUDNN_KNOB_TYPE_BLOCK_SIZE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; +#endif +#if (CUDNN_VERSION >= 8900) + case KnobType_t::OCCUPANCY: + cudnn_knob_type = CUDNN_KNOB_TYPE_OCCUPANCY; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::ARRAY_SIZE_PER_THREAD: + cudnn_knob_type = CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; +#endif +#if (CUDNN_VERSION >= 8905) + case KnobType_t::SPLIT_COLS: + cudnn_knob_type = CUDNN_KNOB_TYPE_SPLIT_COLS; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::TILE_ROWS: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILE_ROWS; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::TILE_COLS: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILE_COLS; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::LOAD_SIZE: + cudnn_knob_type = CUDNN_KNOB_TYPE_LOAD_SIZE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; +#endif +#if (CUDNN_VERSION >= 90700) + case KnobType_t::CTA_COUNT: + cudnn_knob_type = CUDNN_KNOB_TYPE_CTA_COUNT; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::STREAM_K: + cudnn_knob_type = CUDNN_KNOB_TYPE_STREAM_K; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::SPLIT_P_SLC: + cudnn_knob_type = CUDNN_KNOB_TYPE_SPLIT_P_SLC; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::TILE_M: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILE_M; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::TILE_N: + cudnn_knob_type = CUDNN_KNOB_TYPE_TILE_N; + return 
cudnnStatus_t::CUDNN_STATUS_SUCCESS; + case KnobType_t::WARP_SPEC_CFG: + cudnn_knob_type = CUDNN_KNOB_TYPE_WARP_SPEC_CFG; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; +#endif +#ifndef NO_DEFAULT_IN_SWITCH + default: + return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; +#endif + } + return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; +} + +inline KnobType_t +convert_from_backend_knob_type(cudnnBackendKnobType_t cudnn_knob_type) { + switch (cudnn_knob_type) { + case CUDNN_KNOB_TYPE_SWIZZLE: + return KnobType_t::SWIZZLE; + case CUDNN_KNOB_TYPE_TILE_SIZE: + return KnobType_t::TILE_SIZE; + case CUDNN_KNOB_TYPE_EDGE: + return KnobType_t::EDGE; + case CUDNN_KNOB_TYPE_MULTIPLY: + return KnobType_t::MULTIPLY; + case CUDNN_KNOB_TYPE_SPLIT_K_BUF: + return KnobType_t::SPLIT_K_BUF; + case CUDNN_KNOB_TYPE_TILEK: + return KnobType_t::TILEK; + case CUDNN_KNOB_TYPE_STAGES: + return KnobType_t::STAGES; + case CUDNN_KNOB_TYPE_REDUCTION_MODE: + return KnobType_t::REDUCTION_MODE; + case CUDNN_KNOB_TYPE_SPLIT_K_SLC: + return KnobType_t::SPLIT_K_SLC; + case CUDNN_KNOB_TYPE_IDX_MODE: + return KnobType_t::IDX_MODE; + case CUDNN_KNOB_TYPE_SPECFILT: + return KnobType_t::SPECFILT; + case CUDNN_KNOB_TYPE_KERNEL_CFG: + return KnobType_t::KERNEL_CFG; + case CUDNN_KNOB_TYPE_WORKSPACE: + return KnobType_t::WORKSPACE; +#if (CUDNN_VERSION >= 8600) + case CUDNN_KNOB_TYPE_TILE_CGA_M: + return KnobType_t::TILE_CGA_M; + case CUDNN_KNOB_TYPE_TILE_CGA_N: + return KnobType_t::TILE_CGA_N; +#endif +#if (CUDNN_VERSION >= 8800) + case CUDNN_KNOB_TYPE_BLOCK_SIZE: + return KnobType_t::BLOCK_SIZE; +#endif +#if (CUDNN_VERSION >= 8900) + case CUDNN_KNOB_TYPE_OCCUPANCY: + return KnobType_t::OCCUPANCY; + case CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD: + return KnobType_t::ARRAY_SIZE_PER_THREAD; +#endif +#if (CUDNN_VERSION >= 8905) + case CUDNN_KNOB_TYPE_SPLIT_COLS: + return KnobType_t::SPLIT_COLS; + case CUDNN_KNOB_TYPE_TILE_ROWS: + return KnobType_t::TILE_ROWS; + case CUDNN_KNOB_TYPE_TILE_COLS: + return 
KnobType_t::TILE_COLS; + case CUDNN_KNOB_TYPE_LOAD_SIZE: + return KnobType_t::LOAD_SIZE; +#endif +#if (CUDNN_VERSION >= 90700) + case CUDNN_KNOB_TYPE_CTA_COUNT: + return KnobType_t::CTA_COUNT; + case CUDNN_KNOB_TYPE_STREAM_K: + return KnobType_t::STREAM_K; + case CUDNN_KNOB_TYPE_SPLIT_P_SLC: + return KnobType_t::SPLIT_P_SLC; + case CUDNN_KNOB_TYPE_TILE_M: + return KnobType_t::TILE_M; + case CUDNN_KNOB_TYPE_TILE_N: + return KnobType_t::TILE_N; + case CUDNN_KNOB_TYPE_WARP_SPEC_CFG: + return KnobType_t::WARP_SPEC_CFG; +#endif + default: + return KnobType_t::NOT_SET; + } +} + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/adaptive_layernorm.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/adaptive_layernorm.h new file mode 100644 index 00000000..86eaa6d1 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/adaptive_layernorm.h @@ -0,0 +1,454 @@ +#pragma once + +#include "../../cudnn_frontend_Heuristics.h" +#include "../../cudnn_frontend_Logging.h" + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { +class AdaLayerNormNode : public NodeCRTP { + public: + AdaLayernorm_attributes attributes; + + AdaLayerNormNode(AdaLayernorm_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::ADALAYERNORM; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for adalayernorm node " << attributes.name); + + attributes.fill_from_context(context); + + auto X = attributes.inputs[AdaLayernorm_attributes::input_names::X]; + auto Y = attributes.outputs[AdaLayernorm_attributes::output_names::Y]; + + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + Y->set_dim(X->get_dim()); + } + if (Y->get_stride().empty()) 
{ + Y->set_stride(X->get_stride()); + } + + // scale_bias dim is 1,c,h,w + auto scale_bias_dim = X->get_dim(); + scale_bias_dim[0] = 1; + + auto scale = attributes.inputs[AdaLayernorm_attributes::input_names::SCALE]; + // Only infer dims and strides if user did not set them + if (scale->get_dim().empty()) { + scale->set_dim(scale_bias_dim); + } + if (scale->get_stride().empty()) { + auto const& scale_dim = scale->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), scale_dim.size(), stride_order)); + scale->set_stride(detail::generate_stride(scale_dim, stride_order)); + } + + auto bias = attributes.inputs[AdaLayernorm_attributes::input_names::BIAS]; + // Only infer dims and strides if user did not set them + if (bias->get_dim().empty()) { + bias->set_dim(scale_bias_dim); + } + if (bias->get_stride().empty()) { + auto const& bias_dim = bias->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), bias_dim.size(), stride_order)); + bias->set_stride(detail::generate_stride(bias_dim, stride_order)); + } + + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + // stats dim is x where scale == 1 else 1 + auto stats_dim = X->get_dim(); + for (size_t i = 1; i < stats_dim.size(); i++) { + if (scale->get_dim()[i] != 1) { + stats_dim[i] = 1; + } + } + + auto mean = attributes.outputs[AdaLayernorm_attributes::output_names::MEAN]; + // Only infer dims and strides if user did not set them + if (mean->get_dim().empty()) { + mean->set_dim(stats_dim); + } + if (mean->get_stride().empty()) { + auto const& mean_dim = mean->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), mean_dim.size(), stride_order)); + mean->set_stride(detail::generate_stride(mean_dim, stride_order)); + } + + auto inv_var = 
attributes.outputs[AdaLayernorm_attributes::output_names::INV_VARIANCE]; + // Only infer dims and strides if user did not set them + if (inv_var->get_dim().empty()) { + inv_var->set_dim(stats_dim); + } + if (inv_var->get_stride().empty()) { + auto const& inv_var_dim = inv_var->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), inv_var_dim.size(), stride_order)); + inv_var->set_stride(detail::generate_stride(inv_var_dim, stride_order)); + } + } + + // Set scalar tensors + std::vector ones(X->get_dim().size(), 1); + auto infer_scalar_tensors = [&ones](std::shared_ptr& T) { + // Only infer dims and strides if user did not set them + if (T->get_dim().empty()) { + T->set_dim(ones); + } + if (T->get_stride().empty()) { + T->set_stride(ones); + } + }; + infer_scalar_tensors(attributes.inputs[AdaLayernorm_attributes::input_names::EPSILON]); + + return {error_code_t::OK, ""}; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: " << "Validating AdaLayerNormNode " << attributes.name); + // Norm forward phase should be set + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.forward_phase == NormFwdPhase_t::NOT_SET, + error_code_t::ATTRIBUTE_NOT_SET, + "Forward phase not set of adalayernorm node."); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + getLogger() << "[cudnn_frontend] INFO: " << "Building AdaLayernorm operations " << attributes.name << std::endl; + + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "AdaLN fwd requires cuDNN v9.9.0"}; +#if (CUDNN_VERSION >= 90900) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(90900, cudnn_ver_error); + CUDNN_FRONTEND_UNUSED(operations); + auto adalayernorm_operation = + 
make_shared_backend_pointer((cudnnBackendDescriptorType_t)CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR); + + cudnnBackendNormMode_t cudnn_norm_mode; + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::ADA_LAYER_NORM, cudnn_norm_mode)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + cudnnBackendNormFwdPhase_t cudnn_norm_fwd_phase; + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.forward_phase, cudnn_norm_fwd_phase)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, + CUDNN_TYPE_NORM_FWD_PHASE, + 1, + &cudnn_norm_fwd_phase)); + + auto X = attributes.inputs.find(AdaLayernorm_attributes::input_names::X)->second; + auto backend_x = tensors[X->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_x)); + + auto Scale = attributes.inputs.find(AdaLayernorm_attributes::input_names::SCALE)->second; + auto backend_scale = tensors[Scale->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_scale)); + + auto Bias_iter = attributes.inputs.find(AdaLayernorm_attributes::input_names::BIAS); + if (Bias_iter != attributes.inputs.end() && Bias_iter->second->get_is_virtual() == false) { + auto backend_bias = tensors[Bias_iter->second->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC, + 
CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_bias)); + } + + auto Epsilon = attributes.inputs.find(AdaLayernorm_attributes::input_names::EPSILON)->second; + auto backend_epsilon = tensors[Epsilon->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_epsilon)); + + auto Y = attributes.outputs.find(AdaLayernorm_attributes::output_names::Y)->second; + auto backend_y = tensors[Y->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_y)); + + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + auto Mean = attributes.outputs.find(AdaLayernorm_attributes::output_names::MEAN)->second; + auto backend_mean = tensors[Mean->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_mean)); + + auto Inv_variance = attributes.outputs.find(AdaLayernorm_attributes::output_names::INV_VARIANCE)->second; + auto backend_inv_variance = tensors[Inv_variance->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_inv_variance)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(adalayernorm_operation->get_backend_descriptor())); + + raw_operations.push_back(adalayernorm_operation); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + 
return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "ADA_LAYER_NORM"})"_json); + } +#endif +}; + +/*******/ + +class DAdaLayerNormNode : public NodeCRTP { + public: + AdaLayernorm_backward_attributes attributes; + + DAdaLayerNormNode(AdaLayernorm_backward_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::DADALAYERNORM; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for DAdaLayerNorm node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferencing from X works today. 
+ auto X = attributes.inputs[AdaLayernorm_backward_attributes::input_names::X]; + auto const x_tensor_dim = X->get_dim(); + + auto DY = attributes.inputs[AdaLayernorm_backward_attributes::input_names::DY]; + auto dy_tensor_dim = DY->get_dim(); + + // Only infer dims and strides if user did not set them + if (dy_tensor_dim.empty()) { + dy_tensor_dim.resize(x_tensor_dim.size()); + DY->set_dim(x_tensor_dim); + } + if (DY->get_stride().empty()) { + auto const& DY_dim = DY->get_dim(); + // Default to NCHW + auto const& stride_order = detail::generate_row_major_stride_order(DY_dim.size()); + DY->set_stride(detail::generate_stride(DY_dim, stride_order)); + } + + auto DX = attributes.outputs[AdaLayernorm_backward_attributes::output_names::DX]; + auto dx_tensor_dim = DX->get_dim(); + // Only infer dims and strides if user did not set them + if (dx_tensor_dim.empty()) { + dx_tensor_dim.resize(x_tensor_dim.size()); + DX->set_dim(x_tensor_dim); + } + if (DX->get_stride().empty()) { + auto const& DX_dim = DX->get_dim(); + // Default to NCHW + auto const& stride_order = detail::generate_row_major_stride_order(DX_dim.size()); + DX->set_stride(detail::generate_stride(DX_dim, stride_order)); + } + + auto SCALE = attributes.inputs[AdaLayernorm_backward_attributes::input_names::SCALE]; + auto scale_bias_dim = SCALE->get_dim(); + + // Set channel length tensors + auto infer_scale_bias_tensors = [&scale_bias_dim](std::shared_ptr& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (tensor_dim.empty()) { + T->set_dim(scale_bias_dim); + } + if (T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NCHW + auto const& stride_order = detail::generate_row_major_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + + infer_scale_bias_tensors(attributes.outputs[AdaLayernorm_backward_attributes::output_names::DSCALE]); + auto DBIAS = 
attributes.outputs.at(AdaLayernorm_backward_attributes::output_names::DBIAS); + if (DBIAS->get_is_virtual() == false) { + infer_scale_bias_tensors(DBIAS); + } + + return {error_code_t::OK, ""}; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating DAdaLayerNormNode node " << attributes.name); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + getLogger() << "[cudnn_frontend] INFO: " << "Building DAdaLayerNormNode operations " << attributes.name + << std::endl; + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "AdaLN bwd requires cuDNN v9.9.0"}; +#if (CUDNN_VERSION >= 90900) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(90900, cudnn_ver_error); + CUDNN_FRONTEND_UNUSED(operations); + auto adalayernorm_operation = + make_shared_backend_pointer((cudnnBackendDescriptorType_t)CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR); + + cudnnBackendNormMode_t cudnn_norm_mode; + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::ADA_LAYER_NORM, cudnn_norm_mode)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + auto X = attributes.inputs.find(AdaLayernorm_backward_attributes::input_names::X)->second; + auto backend_x = tensors[X->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_x)); + + auto Mean = attributes.inputs.find(AdaLayernorm_backward_attributes::input_names::MEAN)->second; + auto backend_mean = 
tensors[Mean->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_mean)); + + auto Inv_variance = attributes.inputs.find(AdaLayernorm_backward_attributes::input_names::INV_VARIANCE)->second; + auto backend_inv_variance = tensors[Inv_variance->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_inv_variance)); + + auto Dy = attributes.inputs.find(AdaLayernorm_backward_attributes::input_names::DY)->second; + auto backend_dy = tensors[Dy->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_dy)); + + auto Scale = attributes.inputs.find(AdaLayernorm_backward_attributes::input_names::SCALE)->second; + auto backend_scale = tensors[Scale->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_scale)); + + auto Dx = attributes.outputs.find(AdaLayernorm_backward_attributes::output_names::DX)->second; + auto backend_dx = tensors[Dx->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_dx)); + + auto Dscale = attributes.outputs.find(AdaLayernorm_backward_attributes::output_names::DSCALE)->second; + auto backend_dscale = 
tensors[Dscale->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_dscale)); + + auto Dbias_iter = attributes.outputs.find(AdaLayernorm_backward_attributes::output_names::DBIAS); + if (Dbias_iter != attributes.outputs.end() && Dbias_iter->second->get_is_virtual() == false) { + auto backend_dbias = tensors[Dbias_iter->second->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(adalayernorm_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_dbias)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(adalayernorm_operation->get_backend_descriptor())); + + raw_operations.push_back(adalayernorm_operation); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "ADA_LAYER_NORM_BPROP"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/batchnorm.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/batchnorm.h new file mode 100644 index 00000000..69d232be --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/batchnorm.h @@ -0,0 +1,268 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace 
cudnn_frontend { + +namespace graph { +class BatchNormNode : public NodeCRTP { + public: + Batchnorm_attributes attributes; + + BatchNormNode(Batchnorm_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::BATCHNORM; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for batchnorm node " << attributes.name); + + attributes.fill_from_context(context); + + auto X = attributes.inputs[Batchnorm_attributes::input_names::X]; + auto Y = attributes.outputs[Batchnorm_attributes::output_names::Y]; + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + Y->set_dim(X->get_dim()); + } + if (Y->get_stride().empty()) { + Y->set_stride(X->get_stride()); + } + + // Set channel length tensors + auto const x_tensor_dim = X->get_dim(); + auto infer_per_channel_tensors = [&x_tensor_dim](std::shared_ptr& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (tensor_dim.empty()) { + tensor_dim.resize(x_tensor_dim.size(), 1); + tensor_dim[1] = x_tensor_dim[1]; + T->set_dim(tensor_dim); + } + if (T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + infer_per_channel_tensors(attributes.outputs[Batchnorm_attributes::output_names::MEAN]); + infer_per_channel_tensors(attributes.outputs[Batchnorm_attributes::output_names::INV_VARIANCE]); + + auto has_running_stats = attributes.inputs[Batchnorm_attributes::input_names::PREV_RUNNING_MEAN] || + attributes.inputs[Batchnorm_attributes::input_names::PREV_RUNNING_VAR]; + + if (has_running_stats) { + infer_per_channel_tensors(attributes.outputs[Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN]); + 
infer_per_channel_tensors(attributes.outputs[Batchnorm_attributes::output_names::NEXT_RUNNING_VAR]); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building BatchNormNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 batchnorm_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + batchnorm_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR)); + + // Set norm mode to BATCH_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::BATCH_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set forward phase to TRAINING + cudnnBackendNormFwdPhase_t cudnn_norm_fwd_phase; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormFwdPhase_t::TRAINING, cudnn_norm_fwd_phase)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, + CUDNN_TYPE_NORM_FWD_PHASE, + 1, + &cudnn_norm_fwd_phase)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set saved mean and inv_variance + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Batchnorm_attributes::output_names::MEAN); + auto mean_desc = 
tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Batchnorm_attributes::output_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set scale and bias tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_attributes::input_names::BIAS); + auto bias_desc = tensors.at(BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &bias_desc)); + + // Set epsilon tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Batchnorm_attributes::input_names::EPSILON); + auto epsilon_desc = tensors.at(EPSILON->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &epsilon_desc)); + + // Check for running stats + bool has_running_stats = true; + auto it = attributes.inputs.find(Batchnorm_attributes::input_names::PREV_RUNNING_MEAN); + if (it == attributes.inputs.end() || it->second == nullptr) { + has_running_stats = 
false; + } + + if (has_running_stats) { + // Set momentum (exp decay factor) + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, Batchnorm_attributes::input_names::MOMENTUM); + auto momentum_desc = tensors.at(MOMENTUM->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &momentum_desc)); + + // Set prev running mean and var + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, + Batchnorm_attributes::input_names::PREV_RUNNING_MEAN); + auto prev_mean_desc = tensors.at(PREV_RUNNING_MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &prev_mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, + Batchnorm_attributes::input_names::PREV_RUNNING_VAR); + auto prev_var_desc = tensors.at(PREV_RUNNING_VAR->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &prev_var_desc)); + + // Set next running mean and var + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, + Batchnorm_attributes::output_names::NEXT_RUNNING_MEAN); + auto next_mean_desc = tensors.at(NEXT_RUNNING_MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &next_mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, + Batchnorm_attributes::output_names::NEXT_RUNNING_VAR); + auto next_var_desc = tensors.at(NEXT_RUNNING_VAR->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &next_var_desc)); + } + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + // Set peer stat tensors if any + if (!attributes.peer_stats.empty()) { + std::vector peer_stat_descs; + for (auto const& peer_stat : attributes.peer_stats) { + peer_stat_descs.push_back(tensors.at(peer_stat->get_uid())->get_raw_desc()); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + peer_stat_descs.size(), + peer_stat_descs.data())); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(batchnorm_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(batchnorm_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "BATCHNORM"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/batchnorm_inference.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/batchnorm_inference.h new file mode 100644 index 00000000..9fb433f9 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/batchnorm_inference.h @@ -0,0 +1,156 @@ +#pragma once + 
+#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { +class BatchnormInferenceNode : public NodeCRTP { + public: + Batchnorm_inference_attributes attributes; + + BatchnormInferenceNode(Batchnorm_inference_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::BATCHNORM_INFERENCE; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for batchnorm inference node " << attributes.name); + + attributes.fill_from_context(context); + + auto X = attributes.inputs[Batchnorm_inference_attributes::input_names::X]; + auto Y = attributes.outputs[Batchnorm_inference_attributes::output_names::Y]; + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + Y->set_dim(X->get_dim()); + } + if (Y->get_stride().empty()) { + Y->set_stride(X->get_stride()); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building BatchnormInferenceNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 batchnorm_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + batchnorm_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR)); + + // Set norm mode to BATCH_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::BATCH_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + 
&cudnn_norm_mode)); + + // Set forward phase to INFERENCE + cudnnBackendNormFwdPhase_t cudnn_norm_fwd_phase; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormFwdPhase_t::INFERENCE, cudnn_norm_fwd_phase)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, + CUDNN_TYPE_NORM_FWD_PHASE, + 1, + &cudnn_norm_fwd_phase)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_inference_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set mean and inv_variance (as inputs for inference) + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_inference_attributes::input_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Batchnorm_inference_attributes::input_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set scale and bias tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_inference_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + 
CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Batchnorm_inference_attributes::input_names::BIAS); + auto bias_desc = tensors.at(BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &bias_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Batchnorm_inference_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(batchnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(batchnorm_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(batchnorm_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "BATCHNORM_INFERENCE"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/block_scale_dequantize.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/block_scale_dequantize.h new file mode 100644 index 00000000..bbaa2bf0 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/block_scale_dequantize.h @@ -0,0 +1,169 @@ +#pragma once + +#include "../../cudnn_frontend_Logging.h" +#include "../../cudnn_frontend_shim.h" + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { + +class BlockScaleDequantizeNode : public NodeCRTP { + public: + 
Block_scale_dequantize_attributes attributes; + + BlockScaleDequantizeNode(Block_scale_dequantize_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::BLOCK_SCALE_DEQUANTIZE; + } + + error_t + pre_validate_node() const override final { + getLogger() << "[cudnn_frontend] INFO: " + << "Validating BlockScaleDequantizeNode " << attributes.name << std::endl; + + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.block_size.empty(), error_code_t::ATTRIBUTE_NOT_SET, "Block size not set\n"); + + auto Y = attributes.outputs.at(Block_scale_dequantize_attributes::output_names::Y); + + RETURN_CUDNN_FRONTEND_ERROR_IF(!(Y->get_is_virtual()), + error_code_t::INVALID_VALUE, + "Output tensor of dequantize node should be virtual\n"); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + getLogger() << "[cudnn_frontend] INFO: Inferencing properties for BlockScaleDequantizeNode " << attributes.name + << std::endl; + + attributes.fill_from_context(context); + + auto X = attributes.inputs[Block_scale_dequantize_attributes::input_names::X]; + auto scale = attributes.inputs[Block_scale_dequantize_attributes::input_names::scale]; + auto Y = attributes.outputs[Block_scale_dequantize_attributes::output_names::Y]; + + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + Y->set_dim(X->get_dim()); + } + + if (Y->get_stride().empty()) { + Y->set_stride(X->get_stride()); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + getLogger() << "[cudnn_frontend] INFO: " + << "Building BlockScaleDequantizeNode operations " << attributes.name << std::endl; + auto cudnn_ver_error = + 
error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Block scale dequantize requires cuDNN v9.7.0"}; + +#if (CUDNN_VERSION >= 90700) // TODO: v9.99 is new feature branch; switch to release branch when ready + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(90700, cudnn_ver_error); + CUDNN_FRONTEND_UNUSED(operations); + auto block_scale_dequantize_operation = make_shared_backend_pointer( + (cudnnBackendDescriptorType_t)CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR); + + auto X = attributes.inputs.find(Block_scale_dequantize_attributes::input_names::X)->second; + auto backend_x = tensors[X->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_dequantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_x)); + + auto scale = attributes.inputs.find(Block_scale_dequantize_attributes::input_names::scale)->second; + auto backend_scale = tensors[scale->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_dequantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_scale)); + + auto Y = attributes.outputs.find(Block_scale_dequantize_attributes::output_names::Y)->second; + auto backend_y = tensors[Y->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_dequantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_y)); + + cudnnDataType_t cudnn_data_type; + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_dequantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC, + 
CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + const int32_t* block_size = attributes.block_size.data(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_dequantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE, + CUDNN_TYPE_INT32, + attributes.block_size.size(), + block_size)); + +#if (CUDNN_VERSION >= 91400) + if (detail::get_backend_version() >= 91400) { + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_dequantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_NEG_SCALE, + CUDNN_TYPE_BOOLEAN, + 1, + &attributes.is_negative_scale)); + } +#endif + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(block_scale_dequantize_operation->get_backend_descriptor())); + + raw_operations.push_back(block_scale_dequantize_operation); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "BLOCK_SCALE_DEQUANTIZE"})"_json); + } +#endif +}; + +inline void +INode::block_scale_dequantize(std::shared_ptr x, + std::shared_ptr scale, + Block_scale_dequantize_attributes attributes, + std::shared_ptr y) { + attributes.inputs[Block_scale_dequantize_attributes::input_names::X] = x; + attributes.inputs[Block_scale_dequantize_attributes::input_names::scale] = scale; + attributes.outputs[Block_scale_dequantize_attributes::output_names::Y] = y; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff 
--git a/third_party/cudnn-frontend/include/cudnn_frontend/node/block_scale_quantize.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/block_scale_quantize.h new file mode 100644 index 00000000..aba536fc --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/block_scale_quantize.h @@ -0,0 +1,205 @@ +#pragma once + +#include "../../cudnn_frontend_Logging.h" +#include "../../cudnn_frontend_shim.h" + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { + +class BlockScaleQuantizeNode : public NodeCRTP { + public: + Block_scale_quantize_attributes attributes; + + BlockScaleQuantizeNode(Block_scale_quantize_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::BLOCK_SCALE_QUANTIZE; + } + + error_t + pre_validate_node() const override final { + getLogger() << "[cudnn_frontend] INFO: " << "Validating BlockScaleQuantizeNode " << attributes.name + << std::endl; + + RETURN_CUDNN_FRONTEND_ERROR_IF( + !attributes.block_size.has_value(), error_code_t::ATTRIBUTE_NOT_SET, "Block size not set."); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + getLogger() << "[cudnn_frontend] INFO: Inferencing properties for BlockScaleQuantizeNode " << attributes.name + << std::endl; + + attributes.fill_from_context(context); + + auto X = attributes.inputs[Block_scale_quantize_attributes::input_names::X]; + auto Y = attributes.outputs[Block_scale_quantize_attributes::output_names::Y]; + auto scale = attributes.outputs[Block_scale_quantize_attributes::output_names::scale]; + + // Block scale quantize requires the block scale axis to be packed + auto infer_strides_transposed = [&X](std::shared_ptr& T, + std::optional const& axis) { + auto const& dim = T->get_dim(); + auto const& X_dim = X->get_dim(); + auto const& X_stride = X->get_stride(); + + 
std::vector indices(X_stride.size()); + std::iota(indices.begin(), indices.end(), 0); + // Sort indices based on stride values in descending order + std::sort(indices.begin(), indices.end(), [&X_dim, &X_stride](int64_t i, int64_t j) { + // Prioritize singleton dimensions + if (X_stride[i] == X_stride[j]) { + return (X_dim[i] == 1) || (X_dim[j] != 1); + } + return X_stride[i] < X_stride[j]; + }); + if (axis) { + // Rotate left until the axis is the packed dim + std::rotate(indices.begin(), std::find(indices.begin(), indices.end(), axis.value()), indices.end()); + } + std::vector stride_order(X_stride.size()); + for (size_t i = 0; i < indices.size(); ++i) { + stride_order[indices[i]] = i; + } + T->set_stride(detail::generate_stride(dim, stride_order)); + }; + + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + Y->set_dim(X->get_dim()); + } + if (Y->get_stride().empty()) { + if (attributes.transpose) { + infer_strides_transposed(Y, attributes.axis); + } else { + Y->set_stride(X->get_stride()); + } + } + + // Only infer dims and strides if user did not set them + if (scale->get_dim().empty()) { + auto scale_dim = X->get_dim(); + if (attributes.axis) { + scale_dim[attributes.axis.value()] /= attributes.block_size.value(); + } else { + scale_dim.back() /= attributes.block_size.value(); + } + scale->set_dim(scale_dim); + } + if (scale->get_stride().empty()) { + if (attributes.transpose) { + infer_strides_transposed(scale, attributes.axis); + } else { + auto const& scale_dim = scale->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), scale_dim.size(), stride_order)); + scale->set_stride(detail::generate_stride(scale_dim, stride_order)); + } + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + 
std::unordered_map>& tensors) const override final { + getLogger() << "[cudnn_frontend] INFO: " << "Building BlockScaleQuantizeNode operations " << attributes.name + << std::endl; + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Block scale quantize requires cuDNN v9.7.0"}; + +#if (CUDNN_VERSION >= 90700) // TODO: v9.99 is new feature branch; switch to release branch when ready + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(90700, cudnn_ver_error); + CUDNN_FRONTEND_UNUSED(operations); + auto block_scale_quantize_operation = make_shared_backend_pointer( + (cudnnBackendDescriptorType_t)CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR); + + auto X = attributes.inputs.find(Block_scale_quantize_attributes::input_names::X)->second; + auto backend_x = tensors[X->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_quantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_x)); + + auto Y = attributes.outputs.find(Block_scale_quantize_attributes::output_names::Y)->second; + auto backend_y = tensors[Y->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_quantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_y)); + + auto scale = attributes.outputs.find(Block_scale_quantize_attributes::output_names::scale)->second; + auto backend_scale = tensors[scale->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_quantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_scale)); + + cudnnDataType_t cudnn_data_type; + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, 
cudnn_data_type)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_quantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + int32_t block_size = attributes.block_size.value(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(block_scale_quantize_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE, + CUDNN_TYPE_INT32, + 1, + &block_size)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(block_scale_quantize_operation->get_backend_descriptor())); + + raw_operations.push_back(block_scale_quantize_operation); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "BLOCK_SCALE_QUANTIZE"})"_json); + } +#endif +}; + +inline void +INode::block_scale_quantize(std::shared_ptr x, + Block_scale_quantize_attributes attributes, + std::shared_ptr y, + std::shared_ptr scale) { + attributes.inputs[Block_scale_quantize_attributes::input_names::X] = x; + attributes.outputs[Block_scale_quantize_attributes::output_names::Y] = y; + attributes.outputs[Block_scale_quantize_attributes::output_names::scale] = scale; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/bn_finalize.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/bn_finalize.h new file mode 100644 index 
00000000..99d3ec0c --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/bn_finalize.h @@ -0,0 +1,263 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { + +class BatchNormFinalizeNode : public NodeCRTP { + public: + BN_finalize_attributes attributes; + + BatchNormFinalizeNode(BN_finalize_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::BN_FINALIZE; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO:Inferencing properties for batchnorm finalize node " << attributes.name); + + attributes.fill_from_context(context); + + auto SUM = attributes.inputs[BN_finalize_attributes::input_names::SUM]; + auto const sum_tensor_dim = SUM->get_dim(); + + // Set channel length tensors + auto infer_per_channel_tensors = [&sum_tensor_dim](std::shared_ptr& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (tensor_dim.empty()) { + tensor_dim = sum_tensor_dim; + T->set_dim(tensor_dim); + } + if (T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + infer_per_channel_tensors(attributes.outputs[BN_finalize_attributes::output_names::EQ_BIAS]); + infer_per_channel_tensors(attributes.outputs[BN_finalize_attributes::output_names::EQ_SCALE]); + infer_per_channel_tensors(attributes.outputs[BN_finalize_attributes::output_names::MEAN]); + infer_per_channel_tensors(attributes.outputs[BN_finalize_attributes::output_names::INV_VARIANCE]); + infer_per_channel_tensors(attributes.outputs[BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN]); + 
infer_per_channel_tensors(attributes.outputs[BN_finalize_attributes::output_names::NEXT_RUNNING_VAR]); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building BatchNormFinalizeNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 bn_finalize_operation; + + _CUDNN_CHECK_CUDNN_ERROR(bn_finalize_operation.initialize_managed_backend_pointer( + CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR)); + + // Set BN finalize mode + cudnnBnFinalizeStatsMode_t bn_finalize_mode = CUDNN_BN_FINALIZE_STATISTICS_TRAINING; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE, + CUDNN_TYPE_BN_FINALIZE_STATS_MODE, + 1, + &bn_finalize_mode)); + + // Set compute type (math precision) + cudnnDataType_t compute_type = CUDNN_DATA_FLOAT; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &compute_type)); + + // Set SUM input tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SUM, BN_finalize_attributes::input_names::SUM); + auto sum_desc = tensors.at(SUM->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &sum_desc)); + + // Set SQ_SUM input tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SQ_SUM, BN_finalize_attributes::input_names::SQ_SUM); + auto sq_sum_desc = tensors.at(SQ_SUM->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &sq_sum_desc)); + + // Set SCALE input tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, BN_finalize_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + // Set BIAS input tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, BN_finalize_attributes::input_names::BIAS); + auto bias_desc = tensors.at(BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &bias_desc)); + + // Set EQ_SCALE output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE, BN_finalize_attributes::output_names::EQ_SCALE); + auto eq_scale_desc = tensors.at(EQ_SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &eq_scale_desc)); + + // Set EQ_BIAS output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, BN_finalize_attributes::output_names::EQ_BIAS); + auto eq_bias_desc = tensors.at(EQ_BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &eq_bias_desc)); + + // Set PREV_RUNNING_MEAN input tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_MEAN, + BN_finalize_attributes::input_names::PREV_RUNNING_MEAN); + auto prev_running_mean_desc = 
tensors.at(PREV_RUNNING_MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &prev_running_mean_desc)); + + // Set PREV_RUNNING_VAR input tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(PREV_RUNNING_VAR, + BN_finalize_attributes::input_names::PREV_RUNNING_VAR); + auto prev_running_var_desc = tensors.at(PREV_RUNNING_VAR->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &prev_running_var_desc)); + + // Set NEXT_RUNNING_MEAN output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_MEAN, + BN_finalize_attributes::output_names::NEXT_RUNNING_MEAN); + auto next_running_mean_desc = tensors.at(NEXT_RUNNING_MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &next_running_mean_desc)); + + // Set NEXT_RUNNING_VAR output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(NEXT_RUNNING_VAR, + BN_finalize_attributes::output_names::NEXT_RUNNING_VAR); + auto next_running_var_desc = tensors.at(NEXT_RUNNING_VAR->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &next_running_var_desc)); + + // Set MEAN output tensor (saved mean) + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, BN_finalize_attributes::output_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + // Set INV_VARIANCE output tensor (saved inv std) + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, BN_finalize_attributes::output_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set EPSILON tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, BN_finalize_attributes::input_names::EPSILON); + auto epsilon_desc = tensors.at(EPSILON->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &epsilon_desc)); + + // Set MOMENTUM tensor (exp average factor) + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MOMENTUM, BN_finalize_attributes::input_names::MOMENTUM); + auto momentum_desc = tensors.at(MOMENTUM->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &momentum_desc)); + + // Set ACCUM_COUNT tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(ACCUM_COUNT, BN_finalize_attributes::input_names::ACCUM_COUNT); + auto accum_count_desc = tensors.at(ACCUM_COUNT->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_finalize_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &accum_count_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(bn_finalize_operation.get_raw_desc())); + + 
operations.push_back(std::make_shared(std::move(bn_finalize_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "BN_FINALIZE"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/concatenate.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/concatenate.h new file mode 100644 index 00000000..0c051186 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/concatenate.h @@ -0,0 +1,164 @@ +#pragma once + +#include "../../cudnn_frontend_Logging.h" +#include "../../cudnn_frontend_shim.h" + +#include "../graph_helpers.h" +#include "../node_interface.h" +#include + +namespace cudnn_frontend { + +namespace graph { + +class ConcatenateNode : public NodeCRTP { + public: + Concatenate_attributes attributes; + + ConcatenateNode(Concatenate_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::CONCATENATE; + } + + error_t + pre_validate_node() const override final { + getLogger() << "[cudnn_frontend] INFO: " << "Validating ConcatenateNode " << attributes.name << std::endl; + + RETURN_CUDNN_FRONTEND_ERROR_IF(!attributes.axis.has_value(), error_code_t::ATTRIBUTE_NOT_SET, "Axis not set\n"); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + !attributes.in_place_index.has_value(), error_code_t::ATTRIBUTE_NOT_SET, "In-place index not set\n"); + + auto X = attributes.inputs; + + RETURN_CUDNN_FRONTEND_ERROR_IF( + (X.size() == 0), error_code_t::INVALID_VALUE, "Input size of the concatenate node cannot be zero\n"); + + return 
{error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + getLogger() << "[cudnn_frontend] INFO: Inferring properties for ConcatenateNode " << attributes.name + << std::endl; + + attributes.fill_from_context(context); + + auto Y = attributes.outputs[Concatenate_attributes::output_names::Y]; + + // Infer dims and strides only if user did not set them + int64_t dim_sum = 0; + for (const auto& input : attributes.inputs) { + dim_sum += input->get_dim()[attributes.axis.value()]; + } + + auto X = attributes.inputs[0]; + auto dims = X->get_dim(); + dims[attributes.axis.value()] = dim_sum; + + if (Y->get_dim().empty()) { + Y->set_dim(dims); + Y->set_dim(dims); + } + + if (Y->get_stride().empty()) { + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), dims.size(), stride_order)); + Y->set_stride(detail::generate_stride(dims, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + getLogger() << "[cudnn_frontend] INFO: " << "Building ConcatenateNode operations " << attributes.name + << std::endl; + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Concatenate requires cuDNN v9.7.0"}; + +#if (CUDNN_VERSION >= 90700) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(90700, cudnn_ver_error); + CUDNN_FRONTEND_UNUSED(operations); + auto concatenate_operation = make_shared_backend_pointer(CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR); + + std::vector backend_x(attributes.inputs.size()); + size_t index = 0; + for (const auto& input : attributes.inputs) { + backend_x[index] = tensors[input->get_uid()]->get_desc()->get_backend_descriptor(); + index++; + } + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(concatenate_operation->get_backend_descriptor(), 
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + attributes.inputs.size(), + backend_x.data())); + + auto Y = attributes.outputs.find(Concatenate_attributes::output_names::Y)->second; + auto backend_y = tensors[Y->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(concatenate_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_y)); + + int64_t axis = attributes.axis.value(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(concatenate_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONCAT_AXIS, + CUDNN_TYPE_INT64, + 1, + &axis)); + + int64_t in_place_index = attributes.in_place_index.value(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(concatenate_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX, + CUDNN_TYPE_INT64, + 1, + &in_place_index)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(concatenate_operation->get_backend_descriptor())); + + raw_operations.push_back(concatenate_operation); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "CONCATENATE"})"_json); + } +#endif +}; + +inline void +INode::concatenate(std::vector> x, + Concatenate_attributes attributes, + std::shared_ptr y) { + for (auto& element : x) { + attributes.inputs.push_back(element); + } + attributes.outputs[Concatenate_attributes::output_names::Y] = y; + 
sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +} // namespace graph + +} // namespace cudnn_frontend diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_dgrad.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_dgrad.h new file mode 100644 index 00000000..a2e2e4e8 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_dgrad.h @@ -0,0 +1,205 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class DgradNode : public NodeCRTP { + public: + Conv_dgrad_attributes attributes; + + DgradNode(Conv_dgrad_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::DGRAD; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating Node Type::DGRAD " << attributes.name); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for dgrad node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferrencing from (X, DY) -> DW works today. 
+ auto DX = attributes.outputs.find(Conv_dgrad_attributes::output_names::DX)->second; + auto W = attributes.inputs.find(Conv_dgrad_attributes::input_names::W)->second; + auto DY = attributes.inputs.find(Conv_dgrad_attributes::input_names::DY)->second; + + auto const w_tensor_dim = W->get_dim(); + auto const dy_tensor_dim = DY->get_dim(); + auto dx_tensor_dim = DX->get_dim(); + + RETURN_CUDNN_FRONTEND_ERROR_IF(DX->get_dim().empty(), + error_code_t::ATTRIBUTE_NOT_SET, + "For dgrad node, output dimension inferencing is not possible."); + + // No dim inferencing as inverse mapping from DY, W to DX is not unique. + // Only infer strides if user did not set them + if (DX->get_stride().empty()) { + auto const& DX_dim = DX->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DX_dim.size()); + DX->set_stride(detail::generate_stride(DX_dim, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building DgradNode operations " << attributes.name << " "); + + // Create dgrad descriptor by directly calling cuDNN backend API + ConvDesc_v8 dgrad_descriptor; + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + + _CUDNN_CHECK_CUDNN_ERROR( + dgrad_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR)); + + cudnnDataType_t cudnn_data_type; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + cudnnConvolutionMode_t mode = detail::convert_to_cudnn_type(attributes.math_mode); + 
+ _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + dgrad_descriptor.get_raw_desc(), CUDNN_ATTR_CONVOLUTION_CONV_MODE, CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, + CUDNN_TYPE_INT64, + 1, + &spatial_dim_count)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_pre_padding().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_post_padding().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_DILATIONS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_dilation().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_stride().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(dgrad_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(dgrad_descriptor); + + // Create operation by directly calling cuDNN backend API + Operation_v8 dgrad_operation; + + _CUDNN_CHECK_CUDNN_ERROR(dgrad_operation.initialize_managed_backend_pointer( + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Conv_dgrad_attributes::output_names::DX); + auto dx_desc = tensors.at(DX->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dx_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_dgrad_attributes::input_names::W); + auto w_desc = 
tensors.at(W->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &w_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_dgrad_attributes::input_names::DY); + auto dy_desc = tensors.at(DY->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + auto conv_desc_ptr = dgrad_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &conv_desc_ptr)); + + float alpha = 1.0f; + float beta = 0.0f; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA, + CUDNN_TYPE_FLOAT, + 1, + &alpha)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA, + CUDNN_TYPE_FLOAT, + 1, + &beta)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(dgrad_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(dgrad_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "CONV_DGRAD"})"_json); + } +#endif +}; + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_fprop.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_fprop.h new file mode 100644 index 
00000000..f8f22ec7 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_fprop.h @@ -0,0 +1,237 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { +class ConvolutionNode : public NodeCRTP { + public: + Conv_fprop_attributes attributes; + + ConvolutionNode(Conv_fprop_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::CONVOLUTION; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating Node Type::CONVOLUTION " << attributes.name); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for conv node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferrencing from (X, W) -> Y works today. 
+ auto& X = attributes.inputs.find(Conv_fprop_attributes::input_names::X)->second; + auto& W = attributes.inputs.find(Conv_fprop_attributes::input_names::W)->second; + auto& Y = attributes.outputs.find(Conv_fprop_attributes::output_names::Y)->second; + + auto const x_tensor_dim = X->get_dim(); + auto const w_tensor_dim = W->get_dim(); + auto y_tensor_dim = Y->get_dim(); + + // Only infer dims and strides if user did not set them + if (y_tensor_dim.empty()) { + y_tensor_dim.resize(x_tensor_dim.size()); + auto const& pre_padding = attributes.get_pre_padding(); + auto const& post_padding = attributes.get_post_padding(); + auto const& stride = attributes.get_stride(); + auto const& dilation = attributes.get_dilation(); + // N + y_tensor_dim[0] = x_tensor_dim[0]; + // PQ + for (size_t dim = 2; dim < x_tensor_dim.size(); ++dim) { + y_tensor_dim[dim] = 1 + (x_tensor_dim[dim] - dilation[dim - 2] * (w_tensor_dim[dim] - 1) - 1 + + pre_padding[dim - 2] + post_padding[dim - 2]) / + stride[dim - 2]; + } + // K + y_tensor_dim[1] = w_tensor_dim[0]; + Y->set_dim(y_tensor_dim); + } + if (Y->get_stride().empty()) { + auto const& Y_dim = Y->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(Y_dim.size()); + Y->set_stride(detail::generate_stride(Y_dim, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building ConvolutionNode operations " << attributes.name << " "); + + // Create convolution descriptor by directly calling cuDNN backend API + ConvDesc_v8 convolution_descriptor; + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + + _CUDNN_CHECK_CUDNN_ERROR( + 
convolution_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR)); + + // Set compute type + cudnnDataType_t cudnn_data_type; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + // Set convolution mode + cudnnConvolutionMode_t mode = detail::convert_to_cudnn_type(attributes.math_mode); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_CONV_MODE, + CUDNN_TYPE_CONVOLUTION_MODE, + 1, + &mode)); + + // Set spatial dimensions + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, + CUDNN_TYPE_INT64, + 1, + &spatial_dim_count)); + + // Set pre-padding + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_pre_padding().data())); + + // Set post-padding + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_post_padding().data())); + + // Set dilation + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_DILATIONS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_dilation().data())); + + // Set strides + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_stride().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(convolution_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(convolution_descriptor); + 
+ // Create operation by directly calling cuDNN backend API + Operation_v8 convolution_operation; + + _CUDNN_CHECK_CUDNN_ERROR(convolution_operation.initialize_managed_backend_pointer( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_fprop_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set weight tensor W + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(W, Conv_fprop_attributes::input_names::W); + auto w_desc = tensors.at(W->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &w_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Conv_fprop_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + // Set convolution descriptor + auto conv_desc_ptr = convolution_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &conv_desc_ptr)); + + // Set alpha and beta + float alpha = 1.0f; + float beta = 0.0f; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA, + CUDNN_TYPE_FLOAT, + 1, + &alpha)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(convolution_operation.get_raw_desc(), + 
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA, + CUDNN_TYPE_FLOAT, + 1, + &beta)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(convolution_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(convolution_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "CONV_FPROP"})"_json); + } +#endif +}; + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_wgrad.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_wgrad.h new file mode 100644 index 00000000..2f9b478c --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/conv_wgrad.h @@ -0,0 +1,201 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class WgradNode : public NodeCRTP { + public: + Conv_wgrad_attributes attributes; + + WgradNode(Conv_wgrad_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::WGRAD; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating Node Type::WGRAD " << attributes.name); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_pre_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Pre padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_post_padding().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Post padding not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.get_stride().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv strides not set."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + 
attributes.get_dilation().empty(), error_code_t::ATTRIBUTE_NOT_SET, "Conv dilation not set."); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for conv node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferrencing from (X, DY) -> DW works today. + auto X = attributes.inputs[Conv_wgrad_attributes::input_names::X]; + auto DW = attributes.outputs[Conv_wgrad_attributes::output_names::DW]; + auto DY = attributes.inputs[Conv_wgrad_attributes::input_names::DY]; + + auto const x_tensor_dim = X->get_dim(); + auto const dy_tensor_dim = DY->get_dim(); + auto dw_tensor_dim = DW->get_dim(); + + // No dim inferencing as inverse mapping from DY, X to DX is not unique. + // Only infer strides if user did not set them + if (DW->get_stride().empty()) { + auto const& DW_dim = DW->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DW_dim.size()); + DW->set_stride(detail::generate_stride(DW_dim, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building WgradNode operations " << attributes.name << " "); + + // Create wgrad descriptor by directly calling cuDNN backend API + ConvDesc_v8 wgrad_descriptor; + int64_t const spatial_dim_count = attributes.get_pre_padding().size(); + + _CUDNN_CHECK_CUDNN_ERROR( + wgrad_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR)); + + cudnnDataType_t cudnn_data_type; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + cudnnConvolutionMode_t mode = detail::convert_to_cudnn_type(attributes.math_mode); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + wgrad_descriptor.get_raw_desc(), CUDNN_ATTR_CONVOLUTION_CONV_MODE, CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, + CUDNN_TYPE_INT64, + 1, + &spatial_dim_count)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_pre_padding().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_post_padding().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_DILATIONS, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_dilation().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_descriptor.get_raw_desc(), + CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, + CUDNN_TYPE_INT64, + spatial_dim_count, + attributes.get_stride().data())); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(wgrad_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(wgrad_descriptor); + + // Create operation by directly calling cuDNN backend API + Operation_v8 wgrad_operation; + + _CUDNN_CHECK_CUDNN_ERROR(wgrad_operation.initialize_managed_backend_pointer( + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Conv_wgrad_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Conv_wgrad_attributes::input_names::DY); + auto dy_desc = tensors.at(DY->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DW, Conv_wgrad_attributes::output_names::DW); + auto dw_desc = tensors.at(DW->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dw_desc)); + + auto conv_desc_ptr = wgrad_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &conv_desc_ptr)); + + float alpha = 1.0f; + float beta = 0.0f; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA, + CUDNN_TYPE_FLOAT, + 1, + &alpha)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(wgrad_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA, + CUDNN_TYPE_FLOAT, + 1, + &beta)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(wgrad_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(wgrad_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( 
{"tag": "CONV_WGRAD"})"_json); + } +#endif +}; + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/dbn.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/dbn.h new file mode 100644 index 00000000..9f2a23e7 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/dbn.h @@ -0,0 +1,206 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { + +class DBNNode : public NodeCRTP { + public: + Batchnorm_backward_attributes attributes; + + DBNNode(Batchnorm_backward_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::DBN; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for DBN node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferencing from X works today. 
+ auto X = attributes.inputs[Batchnorm_backward_attributes::input_names::X]; + auto const x_tensor_dim = X->get_dim(); + + auto DX = attributes.outputs[Batchnorm_backward_attributes::output_names::DX]; + auto dx_tensor_dim = DX->get_dim(); + // Only infer dims and strides if user did not set them + if (dx_tensor_dim.empty()) { + dx_tensor_dim.resize(x_tensor_dim.size()); + DX->set_dim(x_tensor_dim); + } + if (DX->get_stride().empty()) { + auto const& DX_dim = DX->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DX_dim.size()); + DX->set_stride(detail::generate_stride(DX_dim, stride_order)); + } + + // Set channel length tensors + auto infer_per_channel_tensors = [&x_tensor_dim](std::shared_ptr& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (tensor_dim.empty()) { + tensor_dim.resize(x_tensor_dim.size(), 1); + tensor_dim[1] = x_tensor_dim[1]; + T->set_dim(tensor_dim); + } + if (T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + infer_per_channel_tensors(attributes.outputs[Batchnorm_backward_attributes::output_names::DSCALE]); + infer_per_channel_tensors(attributes.outputs[Batchnorm_backward_attributes::output_names::DBIAS]); + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building DBNNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 dbn_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + 
dbn_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR)); + + // Set norm mode to BATCH_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::BATCH_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Batchnorm_backward_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set DY tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Batchnorm_backward_attributes::input_names::DY); + auto dy_desc = tensors.at(DY->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + // Set scale tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Batchnorm_backward_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + // Set mean and inv_variance tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Batchnorm_backward_attributes::input_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, 
+ Batchnorm_backward_attributes::input_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set DSCALE and DBIAS output tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Batchnorm_backward_attributes::output_names::DSCALE); + auto dscale_desc = tensors.at(DSCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dscale_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Batchnorm_backward_attributes::output_names::DBIAS); + auto dbias_desc = tensors.at(DBIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dbias_desc)); + + // Set DX output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Batchnorm_backward_attributes::output_names::DX); + auto dx_desc = tensors.at(DX->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dx_desc)); + + // Set peer stat tensors if any + if (!attributes.peer_stats.empty()) { + std::vector peer_stat_descs; + for (auto const& peer_stat : attributes.peer_stats) { + peer_stat_descs.push_back(tensors.at(peer_stat->get_uid())->get_raw_desc()); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dbn_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + peer_stat_descs.size(), + peer_stat_descs.data())); + } + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::finalize(dbn_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(dbn_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "DBN"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/dbn_weight.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/dbn_weight.h new file mode 100644 index 00000000..bfad0dfb --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/dbn_weight.h @@ -0,0 +1,215 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { + +class DBNWeightNode : public NodeCRTP { + public: + DBN_weight_attributes attributes; + + DBNWeightNode(DBN_weight_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::DBN_WEIGHT; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for batchnorm finalize node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferencing from DY works today. 
+ auto DY = attributes.inputs[DBN_weight_attributes::input_names::DY]; + auto const dy_tensor_dim = DY->get_dim(); + + auto X = attributes.inputs[DBN_weight_attributes::input_names::X]; + auto x_tensor_dim = X->get_dim(); + // Only infer dims and strides if user did not set them + if (x_tensor_dim.empty()) { + x_tensor_dim.resize(dy_tensor_dim.size()); + X->set_dim(dy_tensor_dim); + } + if (X->get_stride().empty()) { + auto const& X_dim = X->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(X_dim.size()); + X->set_stride(detail::generate_stride(X_dim, stride_order)); + } + + // Set channel length tensors + auto infer_per_channel_tensors = [&dy_tensor_dim](std::shared_ptr const& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (T->get_dim().empty()) { + tensor_dim.resize(dy_tensor_dim.size(), 1); + tensor_dim[1] = dy_tensor_dim[1]; + T->set_dim(tensor_dim); + } + if (T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + infer_per_channel_tensors(attributes.outputs[DBN_weight_attributes::output_names::DBIAS]); + infer_per_channel_tensors(attributes.outputs[DBN_weight_attributes::output_names::DSCALE]); + infer_per_channel_tensors(attributes.outputs[DBN_weight_attributes::output_names::EQ_BIAS]); + infer_per_channel_tensors(attributes.outputs[DBN_weight_attributes::output_names::EQ_SCALE_DY]); + infer_per_channel_tensors(attributes.outputs[DBN_weight_attributes::output_names::EQ_SCALE_X]); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + 
CUDNN_FE_LOG_LABEL("INFO: " << "Building DBNWeightNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 bn_bwd_weight_operation; + + _CUDNN_CHECK_CUDNN_ERROR(bn_bwd_weight_operation.initialize_managed_backend_pointer( + CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR)); + + // Set compute type (math precision) + cudnnDataType_t compute_type = CUDNN_DATA_FLOAT; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &compute_type)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, DBN_weight_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set DY tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, DBN_weight_attributes::input_names::DY); + auto dy_desc = tensors.at(DY->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + // Set mean tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, DBN_weight_attributes::input_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + // Set inv_variance tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, DBN_weight_attributes::input_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set scale tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, DBN_weight_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + // Set DSCALE output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, DBN_weight_attributes::output_names::DSCALE); + auto dscale_desc = tensors.at(DSCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dscale_desc)); + + // Set DBIAS output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, DBN_weight_attributes::output_names::DBIAS); + auto dbias_desc = tensors.at(DBIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dbias_desc)); + + // Set EQ_SCALE_DY output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_DY, DBN_weight_attributes::output_names::EQ_SCALE_DY); + auto eq_scale_dy_desc = tensors.at(EQ_SCALE_DY->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &eq_scale_dy_desc)); + + // Set EQ_SCALE_X output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_SCALE_X, DBN_weight_attributes::output_names::EQ_SCALE_X); + auto 
eq_scale_x_desc = tensors.at(EQ_SCALE_X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &eq_scale_x_desc)); + + // Set EQ_BIAS output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(EQ_BIAS, DBN_weight_attributes::output_names::EQ_BIAS); + auto eq_bias_desc = tensors.at(EQ_BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(bn_bwd_weight_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &eq_bias_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(bn_bwd_weight_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(bn_bwd_weight_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "DBN_WEIGHT"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/dln.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/dln.h new file mode 100644 index 00000000..4dd508a5 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/dln.h @@ -0,0 +1,227 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { + +class DLNNode : public NodeCRTP { + public: + Layernorm_backward_attributes attributes; + + DLNNode(Layernorm_backward_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override 
final { + return Type::DLN; + } + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for DLN node " << attributes.name); + + // WAR as epsilon was required in previous versions + if (detail::get_backend_version() < 8906) { + attributes.inputs[Layernorm_backward_attributes::input_names::EPSILON] = + std::make_shared(0.0f); + } + + attributes.fill_from_context(context); + + // TODO: Only inferencing from X works today. + auto X = attributes.inputs[Layernorm_backward_attributes::input_names::X]; + auto const x_tensor_dim = X->get_dim(); + + auto DY = attributes.inputs[Layernorm_backward_attributes::input_names::DY]; + auto dy_tensor_dim = DY->get_dim(); + + // Only infer dims and strides if user did not set them + if (dy_tensor_dim.empty()) { + dy_tensor_dim.resize(x_tensor_dim.size()); + DY->set_dim(x_tensor_dim); + } + if (DY->get_stride().empty()) { + auto const& DY_dim = DY->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DY_dim.size()); + DY->set_stride(detail::generate_stride(DY_dim, stride_order)); + } + + auto DX = attributes.outputs[Layernorm_backward_attributes::output_names::DX]; + auto dx_tensor_dim = DX->get_dim(); + // Only infer dims and strides if user did not set them + if (dx_tensor_dim.empty()) { + dx_tensor_dim.resize(x_tensor_dim.size()); + DX->set_dim(x_tensor_dim); + } + if (DX->get_stride().empty()) { + auto const& DX_dim = DX->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DX_dim.size()); + DX->set_stride(detail::generate_stride(DX_dim, stride_order)); + } + + auto scale_bias_dim = X->get_dim(); + scale_bias_dim[0] = 1; + + // Set channel length tensors + auto infer_scale_bias_tensors = [&scale_bias_dim](std::shared_ptr& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (tensor_dim.empty()) { + T->set_dim(scale_bias_dim); + } + if 
(T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + + infer_scale_bias_tensors(attributes.outputs[Layernorm_backward_attributes::output_names::DSCALE]); + infer_scale_bias_tensors(attributes.outputs[Layernorm_backward_attributes::output_names::DBIAS]); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building DLNNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 dln_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + dln_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR)); + + // Set norm mode to LAYER_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::LAYER_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_backward_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set DY tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Layernorm_backward_attributes::input_names::DY); + auto dy_desc = tensors.at(DY->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + // Set scale tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_backward_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + // Set mean and inv_variance tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Layernorm_backward_attributes::input_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Layernorm_backward_attributes::input_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set DSCALE and DBIAS output tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Layernorm_backward_attributes::output_names::DSCALE); + auto dscale_desc = tensors.at(DSCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dscale_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Layernorm_backward_attributes::output_names::DBIAS); + auto dbias_desc = tensors.at(DBIAS->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dbias_desc)); + + // Set DX output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Layernorm_backward_attributes::output_names::DX); + auto dx_desc = tensors.at(DX->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dx_desc)); + + // Set epsilon tensor for older backend versions + if (detail::get_backend_version() < 8906) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Layernorm_backward_attributes::input_names::EPSILON); + auto epsilon_desc = tensors.at(EPSILON->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(dln_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &epsilon_desc)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(dln_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(dln_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "LAYER_NORM_BPROP"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/genstats.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/genstats.h new file mode 100644 index 00000000..8f918975 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/genstats.h @@ -0,0 +1,147 @@ +#pragma once + +#include "../graph_helpers.h" +#include 
"../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { + +class GenstatsNode : public NodeCRTP { + public: + Genstats_attributes attributes; + + GenstatsNode(Genstats_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::GENSTATS; + } + + error_t + infer_properties_node() override final { + attributes.fill_from_context(context); + + // Only inferrencing from X works today. + auto X = attributes.inputs[Genstats_attributes::input_names::X]; + auto SUM = attributes.outputs[Genstats_attributes::output_names::SUM]; + auto SQ_SUM = attributes.outputs[Genstats_attributes::output_names::SQ_SUM]; + + auto const x_tensor_dim = X->get_dim(); + auto sum_tensor_dim = SUM->get_dim(); + auto sq_sum_tensor_dim = SQ_SUM->get_dim(); + + // Only infer dims and strides if user did not set them + if (sum_tensor_dim.empty()) { + sum_tensor_dim.resize(x_tensor_dim.size(), 1); + sum_tensor_dim[1] = x_tensor_dim[1]; + SUM->set_dim(sum_tensor_dim); + } + if (SUM->get_stride().empty()) { + auto const& SUM_dim = SUM->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(SUM_dim.size()); + SUM->set_stride(detail::generate_stride(SUM_dim, stride_order)); + } + + // Only infer dims and strides if user did not set them + if (sq_sum_tensor_dim.empty()) { + sq_sum_tensor_dim.resize(x_tensor_dim.size(), 1); + sq_sum_tensor_dim[1] = x_tensor_dim[1]; + SQ_SUM->set_dim(sq_sum_tensor_dim); + } + if (SQ_SUM->get_stride().empty()) { + auto const& SQ_SUM_dim = SQ_SUM->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(SQ_SUM_dim.size()); + SQ_SUM->set_stride(detail::generate_stride(SQ_SUM_dim, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + 
managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building GenstatsNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 genstats_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + genstats_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Genstats_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(genstats_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_GENSTATS_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set gen stats mode + cudnnGenStatsMode_t genstats_mode = CUDNN_GENSTATS_SUM_SQSUM; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(genstats_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_GENSTATS_MODE, + CUDNN_TYPE_GENSTATS_MODE, + 1, + &genstats_mode)); + + // Set math precision based on X tensor data type + cudnnDataType_t math_prec = static_cast(tensors.at(X->second->get_uid())->getDataType()); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(genstats_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &math_prec)); + + // Set SUM output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SUM, Genstats_attributes::output_names::SUM); + auto sum_desc = tensors.at(SUM->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(genstats_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &sum_desc)); + + // Set SQ_SUM output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(SQ_SUM, Genstats_attributes::output_names::SQ_SUM); + auto sq_sum_desc = tensors.at(SQ_SUM->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(genstats_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &sq_sum_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(genstats_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(genstats_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "GENSTATS"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/instancenorm.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/instancenorm.h new file mode 100644 index 00000000..1b71f4ab --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/instancenorm.h @@ -0,0 +1,414 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { +class InstanceNormNode : public NodeCRTP { + public: + Instancenorm_attributes attributes; + + InstanceNormNode(Instancenorm_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::INSTANCENORM; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for instancenorm node " << attributes.name); + + attributes.fill_from_context(context); + + auto X = attributes.inputs[Instancenorm_attributes::input_names::X]; + auto Y = attributes.outputs[Instancenorm_attributes::output_names::Y]; + + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + 
Y->set_dim(X->get_dim()); + } + if (Y->get_stride().empty()) { + Y->set_stride(X->get_stride()); + } + + // mean inv_var dim is n,c,1,1 + auto stats_dim = X->get_dim(); + for (size_t i = 2; i < stats_dim.size(); i++) { + stats_dim[i] = 1; + } + + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + auto mean = attributes.outputs[Instancenorm_attributes::output_names::MEAN]; + if (mean->get_dim().empty()) { + mean->set_dim(stats_dim); + } + if (mean->get_stride().empty()) { + auto const& mean_dim = mean->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(mean_dim.size()); + mean->set_stride(detail::generate_stride(mean_dim, stride_order)); + } + + auto inv_var = attributes.outputs[Instancenorm_attributes::output_names::INV_VARIANCE]; + if (inv_var->get_dim().empty()) { + inv_var->set_dim(stats_dim); + } + if (inv_var->get_stride().empty()) { + auto const& inv_var_dim = inv_var->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(inv_var_dim.size()); + inv_var->set_stride(detail::generate_stride(inv_var_dim, stride_order)); + } + } + return {error_code_t::OK, ""}; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating InstanceNormNode " << attributes.name); + + // Norm forward phase should be set + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.forward_phase == NormFwdPhase_t::NOT_SET, + error_code_t::ATTRIBUTE_NOT_SET, + "Forward phase not set of instancenorm node."); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building InstanceNormNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + 
Operation_v8 instancenorm_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + instancenorm_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR)); + + // Set norm mode to INSTANCE_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::INSTANCE_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set forward phase + cudnnBackendNormFwdPhase_t cudnn_norm_fwd_phase; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.forward_phase, cudnn_norm_fwd_phase)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, + CUDNN_TYPE_NORM_FWD_PHASE, + 1, + &cudnn_norm_fwd_phase)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set scale and bias tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Instancenorm_attributes::input_names::BIAS); + auto bias_desc = tensors.at(BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC, + 
CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &bias_desc)); + + // Set epsilon tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Instancenorm_attributes::input_names::EPSILON); + auto epsilon_desc = tensors.at(EPSILON->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &epsilon_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Instancenorm_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + // Set mean and inv_variance for training phase + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Instancenorm_attributes::output_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, + Instancenorm_attributes::output_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(instancenorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(instancenorm_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(instancenorm_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), 
non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "INSTANCE_NORM"})"_json); + } +#endif +}; + +class DINNode : public NodeCRTP { + public: + Instancenorm_backward_attributes attributes; + + DINNode(Instancenorm_backward_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::DIN; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for DIN node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferencing from X works today. + auto X = attributes.inputs[Instancenorm_backward_attributes::input_names::X]; + auto const x_tensor_dim = X->get_dim(); + + auto DY = attributes.inputs[Instancenorm_backward_attributes::input_names::DY]; + auto dy_tensor_dim = DY->get_dim(); + + // Only infer dims and strides if user did not set them + if (dy_tensor_dim.empty()) { + dy_tensor_dim.resize(x_tensor_dim.size()); + DY->set_dim(x_tensor_dim); + } + if (DY->get_stride().empty()) { + auto const& DY_dim = DY->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DY_dim.size()); + DY->set_stride(detail::generate_stride(DY_dim, stride_order)); + } + + auto DX = attributes.outputs[Instancenorm_backward_attributes::output_names::DX]; + auto dx_tensor_dim = DX->get_dim(); + // Only infer dims and strides if user did not set them + if (dx_tensor_dim.empty()) { + dx_tensor_dim.resize(x_tensor_dim.size()); + DX->set_dim(x_tensor_dim); + } + if (DX->get_stride().empty()) { + auto const& DX_dim = DX->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DX_dim.size()); + DX->set_stride(detail::generate_stride(DX_dim, stride_order)); + } + + // scale_bias 
dim is 1,c,1,1 + // mean inv_var dim is n,c,1,1 + auto scale_bias_dim = X->get_dim(); + for (size_t i = 0; i < scale_bias_dim.size(); i++) { + if (i != 1) { + scale_bias_dim[i] = 1; + } + } + + // Set channel length tensors + auto infer_scale_bias_tensors = [&scale_bias_dim](std::shared_ptr& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (tensor_dim.empty()) { + T->set_dim(scale_bias_dim); + } + if (T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + + infer_scale_bias_tensors(attributes.outputs[Instancenorm_backward_attributes::output_names::DSCALE]); + infer_scale_bias_tensors(attributes.outputs[Instancenorm_backward_attributes::output_names::DBIAS]); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building DINNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 din_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + din_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR)); + + // Set norm mode to INSTANCE_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::INSTANCE_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Instancenorm_backward_attributes::input_names::X); + auto 
x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set DY tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Instancenorm_backward_attributes::input_names::DY); + auto dy_desc = tensors.at(DY->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + // Set scale tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Instancenorm_backward_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + // Set mean and inv_variance tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(MEAN, Instancenorm_backward_attributes::input_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, + Instancenorm_backward_attributes::input_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set DSCALE and DBIAS output tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Instancenorm_backward_attributes::output_names::DSCALE); + auto dscale_desc = tensors.at(DSCALE->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dscale_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Instancenorm_backward_attributes::output_names::DBIAS); + auto dbias_desc = tensors.at(DBIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dbias_desc)); + + // Set DX output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Instancenorm_backward_attributes::output_names::DX); + auto dx_desc = tensors.at(DX->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(din_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dx_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(din_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(din_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "INSTANCE_NORM_BPROP"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/layernorm.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/layernorm.h new file mode 100644 index 00000000..46420aa2 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/layernorm.h @@ -0,0 +1,259 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { +class LayerNormNode : public NodeCRTP { 
+ public: + Layernorm_attributes attributes; + + LayerNormNode(Layernorm_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::LAYERNORM; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for layernorm node " << attributes.name); + + attributes.fill_from_context(context); + + auto X = attributes.inputs[Layernorm_attributes::input_names::X]; + auto Y = attributes.outputs[Layernorm_attributes::output_names::Y]; + + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + Y->set_dim(X->get_dim()); + } + if (Y->get_stride().empty()) { + Y->set_stride(X->get_stride()); + } + + // scale_bias dim is 1,c,h,w + auto scale_bias_dim = X->get_dim(); + scale_bias_dim[0] = 1; + + auto scale = attributes.inputs[Layernorm_attributes::input_names::SCALE]; + // Only infer dims and strides if user did not set them + if (scale->get_dim().empty()) { + scale->set_dim(scale_bias_dim); + } + if (scale->get_stride().empty()) { + auto const& scale_dim = scale->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), scale_dim.size(), stride_order)); + scale->set_stride(detail::generate_stride(scale_dim, stride_order)); + } + + auto bias = attributes.inputs[Layernorm_attributes::input_names::BIAS]; + // Only infer dims and strides if user did not set them + if (bias->get_dim().empty()) { + bias->set_dim(scale_bias_dim); + } + if (bias->get_stride().empty()) { + auto const& bias_dim = bias->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), bias_dim.size(), stride_order)); + bias->set_stride(detail::generate_stride(bias_dim, stride_order)); + } + + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + // stats dim 
is x where scale == 1 else 1 + auto stats_dim = X->get_dim(); + for (size_t i = 0; i < stats_dim.size(); i++) { + if (scale->get_dim()[i] != 1) { + stats_dim[i] = 1; + } + } + + auto mean = attributes.outputs[Layernorm_attributes::output_names::MEAN]; + // Only infer dims and strides if user did not set them + if (mean->get_dim().empty()) { + mean->set_dim(stats_dim); + } + if (mean->get_stride().empty()) { + auto const& mean_dim = mean->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), mean_dim.size(), stride_order)); + mean->set_stride(detail::generate_stride(mean_dim, stride_order)); + } + + auto inv_var = attributes.outputs[Layernorm_attributes::output_names::INV_VARIANCE]; + // Only infer dims and strides if user did not set them + if (inv_var->get_dim().empty()) { + inv_var->set_dim(stats_dim); + } + if (inv_var->get_stride().empty()) { + auto const& inv_var_dim = inv_var->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), inv_var_dim.size(), stride_order)); + inv_var->set_stride(detail::generate_stride(inv_var_dim, stride_order)); + } + } + + // Set scalar tensors + std::vector ones(X->get_dim().size(), 1); + auto infer_scalar_tensors = [&ones](std::shared_ptr& T) { + // Only infer dims and strides if user did not set them + if (T->get_dim().empty()) { + T->set_dim(ones); + } + if (T->get_stride().empty()) { + T->set_stride(ones); + } + }; + infer_scalar_tensors(attributes.inputs[Layernorm_attributes::input_names::EPSILON]); + + return {error_code_t::OK, ""}; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: " << "Validating LayerNormNode " << attributes.name); + + // Norm forward phase should be set + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.forward_phase == NormFwdPhase_t::NOT_SET, + error_code_t::ATTRIBUTE_NOT_SET, + "Forward phase not set of 
layernorm node."); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building LayerNormNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 layernorm_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + layernorm_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR)); + + // Set norm mode + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::LAYER_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set forward phase + cudnnBackendNormFwdPhase_t cudnn_norm_fwd_phase; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.forward_phase, cudnn_norm_fwd_phase)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, + CUDNN_TYPE_NORM_FWD_PHASE, + 1, + &cudnn_norm_fwd_phase)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Layernorm_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set scale and bias tensors + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Layernorm_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(BIAS, Layernorm_attributes::input_names::BIAS); + auto bias_desc = tensors.at(BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &bias_desc)); + + // Set epsilon tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Layernorm_attributes::input_names::EPSILON); + auto epsilon_desc = tensors.at(EPSILON->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &epsilon_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Layernorm_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + // Set mean and inv_variance for training phase + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(MEAN, Layernorm_attributes::output_names::MEAN); + auto mean_desc = tensors.at(MEAN->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &mean_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Layernorm_attributes::output_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(layernorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(layernorm_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(layernorm_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "LAYER_NORM"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/matmul.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/matmul.h new file mode 100644 index 00000000..f09d3415 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/matmul.h @@ -0,0 +1,253 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class MatmulNode : public NodeCRTP { + public: + Matmul_attributes attributes; + + MatmulNode(Matmul_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::MATMUL; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for matmul node " << attributes.name); + + attributes.fill_from_context(context); + + // Only inferrencing from (A, B) -> C works today. 
+ auto a_tensor = attributes.inputs[Matmul_attributes::input_names::A]; + auto b_tensor = attributes.inputs[Matmul_attributes::input_names::B]; + auto c_tensor = attributes.outputs[Matmul_attributes::output_names::C]; + + auto const a_tensor_dim = a_tensor->get_dim(); + auto const b_tensor_dim = b_tensor->get_dim(); + auto c_tensor_dim = c_tensor->get_dim(); + + // Only infer dims and strides if user did not set them + if (c_tensor_dim.empty()) { + // CHECK_CUDNN_FRONTEND_ERROR(detail::generate_matmul_output_dim(a_tensor_dim, b_tensor_dim, c_tensor_dim)); + + c_tensor_dim.resize(a_tensor_dim.size()); + int64_t gemm_start_dim = a_tensor_dim.size() - 2; + c_tensor_dim[gemm_start_dim] = a_tensor_dim[gemm_start_dim]; // M + c_tensor_dim[gemm_start_dim + 1] = b_tensor_dim[gemm_start_dim + 1]; // N + + // Broadcast the batches + for (int64_t i = 0; i < gemm_start_dim; ++i) { + c_tensor_dim[i] = std::max(a_tensor_dim[i], b_tensor_dim[i]); + } + + c_tensor->set_dim(c_tensor_dim); + } + if (c_tensor->get_stride().empty()) { + auto const& c_dim = c_tensor->get_dim(); + // Default to Col major + auto const& stride_order = detail::generate_row_major_stride_order(c_dim.size()); + c_tensor->set_stride(detail::generate_stride(c_dim, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building MatmulNode operations " << attributes.name << " "); + + // Create matmul descriptor by directly calling cuDNN backend API + MatMulDesc_v8 matmul_descriptor; + + _CUDNN_CHECK_CUDNN_ERROR(matmul_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_MATMUL_DESCRIPTOR)); + + // Set compute type + cudnnDataType_t cudnn_data_type; + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + matmul_descriptor.get_raw_desc(), CUDNN_ATTR_MATMUL_COMP_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &cudnn_data_type)); + + // Set padding value if specified +#if (CUDNN_VERSION >= 8900) + if (attributes.padding_value != 0.0) { + double padding_value = attributes.padding_value; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_descriptor.get_raw_desc(), + CUDNN_ATTR_MATMUL_PADDING_VALUE, + CUDNN_TYPE_DOUBLE, + 1, + &padding_value)); + } +#endif + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(matmul_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(matmul_descriptor); + + // Create operation by directly calling cuDNN backend API + Operation_v8 matmul_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + matmul_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR)); + + // Set input tensor A + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(A, Matmul_attributes::input_names::A); + auto a_desc = tensors.at(A->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_MATMUL_ADESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &a_desc)); + + // Set input tensor B + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(B, Matmul_attributes::input_names::B); + auto b_desc = tensors.at(B->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_MATMUL_BDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &b_desc)); + + // Set output tensor C + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(C, Matmul_attributes::output_names::C); + auto c_desc = tensors.at(C->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_MATMUL_CDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &c_desc)); 
+ + // Set matmul descriptor + auto matmul_desc_ptr = matmul_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_MATMUL_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &matmul_desc_ptr)); + + // Set optional override tensors + auto M_override = attributes.inputs.find(Matmul_attributes::input_names::M_override); + if ((M_override != attributes.inputs.end()) && (M_override->second != nullptr)) { + auto m_override_desc = tensors.at(M_override->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &m_override_desc)); + } + + auto N_override = attributes.inputs.find(Matmul_attributes::input_names::N_override); + if ((N_override != attributes.inputs.end()) && (N_override->second != nullptr)) { + auto n_override_desc = tensors.at(N_override->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &n_override_desc)); + } + + auto K_override = attributes.inputs.find(Matmul_attributes::input_names::K_override); + if ((K_override != attributes.inputs.end()) && (K_override->second != nullptr)) { + auto k_override_desc = tensors.at(K_override->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(matmul_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &k_override_desc)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(matmul_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(matmul_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return 
{error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "MATMUL"})"_json); + } +#endif +}; + +inline void +INode::matmul(std::shared_ptr a, + std::shared_ptr b, + Matmul_attributes attributes, + std::shared_ptr c) { + attributes.inputs[Matmul_attributes::input_names::A] = a; + attributes.inputs[Matmul_attributes::input_names::B] = b; + attributes.outputs[Matmul_attributes::output_names::C] = c; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +inline std::shared_ptr +INode::matmul(std::shared_ptr a, + std::shared_ptr b, + Matmul_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + attributes.inputs[Matmul_attributes::input_names::A] = a; + attributes.inputs[Matmul_attributes::input_names::B] = b; + + if (a->get_name().empty()) { + a->set_name(attributes.name + "::A"); + }; + if (b->get_name().empty()) { + b->set_name(attributes.name + "::B"); + }; + + auto m_override = attributes.inputs.find(Matmul_attributes::input_names::M_override); + auto n_override = attributes.inputs.find(Matmul_attributes::input_names::N_override); + auto k_override = attributes.inputs.find(Matmul_attributes::input_names::K_override); + + if (m_override != attributes.inputs.end()) { + auto tensor = m_override->second; + if (tensor && tensor->get_name().empty()) { + tensor->set_name(attributes.name + "::M_override"); + } + } + if (n_override != attributes.inputs.end()) { + auto tensor = n_override->second; + if (tensor && tensor->get_name().empty()) { + tensor->set_name(attributes.name + "::N_override"); + } + } + if (k_override != attributes.inputs.end()) { + auto tensor = k_override->second; + if (tensor && tensor->get_name().empty()) { + tensor->set_name(attributes.name + "::K_override"); + } + } + + auto C = attributes.outputs[Matmul_attributes::output_names::C] = 
output_tensor(attributes.name + "::C"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return C; +} + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/matmul_fp8.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/matmul_fp8.h new file mode 100644 index 00000000..d3fe5e58 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/matmul_fp8.h @@ -0,0 +1,104 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class MatmulFP8Node : public NodeCRTP { + public: + Matmul_fp8_attributes attributes; + + MatmulFP8Node(Matmul_fp8_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::MATMUL; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for matmul fp8 node " << attributes.name); + + attributes.fill_from_context(context); + + auto const& a_dim = attributes.inputs.at(Matmul_fp8_attributes::input_names::A)->get_dim(); + auto const& b_dim = attributes.inputs.at(Matmul_fp8_attributes::input_names::B)->get_dim(); + auto const& c_dim = attributes.outputs.at(Matmul_fp8_attributes::output_names::C)->get_dim(); + + std::shared_ptr last_output; + + // Matmul + + auto matmul_attributes = Matmul_attributes(); + matmul_attributes.clone_fp8_attributes(attributes); + matmul_attributes.set_name("matmul"); + + last_output = matmul(attributes.inputs.at(Matmul_fp8_attributes::input_names::A), + attributes.inputs.at(Matmul_fp8_attributes::input_names::B), + matmul_attributes); + + // Reduction if GQA for head dimension + if (a_dim.size() == 4 && b_dim.size() == 4 && c_dim.size() == 4 && a_dim[1] == b_dim[1] && + a_dim[1] != c_dim[1] && (a_dim[1] % c_dim[1] == 0)) { + auto gqa_attributes = 
Reduction_attributes().set_name("gqa_c").set_mode(ReductionMode_t::ADD); + last_output = reduction(last_output, gqa_attributes); + last_output->set_dim(c_dim); + } + + //// Scale Descales + auto mul_attributes = Pointwise_attributes().set_mode(PointwiseMode_t::MUL); + // Descale A + mul_attributes.set_name("descale_a"); + last_output = + pointwise(last_output, attributes.inputs.at(Matmul_fp8_attributes::input_names::Descale_A), mul_attributes); + + // Descale B + mul_attributes.set_name("descale_b"); + last_output = + pointwise(last_output, attributes.inputs.at(Matmul_fp8_attributes::input_names::Descale_B), mul_attributes); + + // Scale C + mul_attributes.set_name("scale_c"); + // Special non-functional-style call. Needed because output already created and provided to user. + pointwise(last_output, + attributes.inputs.at(Matmul_fp8_attributes::input_names::Scale_C), + mul_attributes, + attributes.outputs.at(Matmul_fp8_attributes::output_names::C)); + + // Amax C + auto amax_attributes = Reduction_attributes().set_name("amax_c").set_mode(ReductionMode_t::AMAX); + // Special non-functional-style call. Needed because output already created and provided to user. 
+ reduction(last_output, amax_attributes, attributes.outputs.at(Matmul_fp8_attributes::output_names::Amax_C)); + + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "MATMUL_FP8"})"_json); + } +#endif +}; +inline void +INode::matmul_fp8(std::shared_ptr a, + std::shared_ptr b, + std::shared_ptr descale_a, + std::shared_ptr descale_b, + std::shared_ptr scale_c, + Matmul_fp8_attributes attributes, + std::shared_ptr c, + std::shared_ptr amax_c) { + attributes.inputs[Matmul_fp8_attributes::input_names::A] = a; + attributes.inputs[Matmul_fp8_attributes::input_names::B] = b; + attributes.inputs[Matmul_fp8_attributes::input_names::Descale_A] = descale_a; + attributes.inputs[Matmul_fp8_attributes::input_names::Descale_B] = descale_b; + attributes.inputs[Matmul_fp8_attributes::input_names::Scale_C] = scale_c; + attributes.outputs[Matmul_fp8_attributes::output_names::C] = c; + attributes.outputs[Matmul_fp8_attributes::output_names::Amax_C] = amax_c; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/moe_grouped_matmul.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/moe_grouped_matmul.h new file mode 100644 index 00000000..e12acde0 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/moe_grouped_matmul.h @@ -0,0 +1,192 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class MoeGroupedMatmulNode : public NodeCRTP { + public: + Moe_grouped_matmul_attributes attributes; + + MoeGroupedMatmulNode(Moe_grouped_matmul_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return 
Type::MOE_GROUPED_MATMUL; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for moe grouped matmul node " << attributes.name); + + attributes.fill_from_context(context); + + auto token_tensor = attributes.inputs[Moe_grouped_matmul_attributes::input_names::Token]; + auto weight_tensor = attributes.inputs[Moe_grouped_matmul_attributes::input_names::Weight]; + auto token_index_tensor = attributes.inputs[Moe_grouped_matmul_attributes::input_names::TokenIndex]; + auto output_tensor = attributes.outputs[Moe_grouped_matmul_attributes::output_names::Output]; + + auto const token_tensor_dim = token_tensor->get_dim(); + auto const weight_tensor_dim = weight_tensor->get_dim(); + auto output_tensor_dim = output_tensor->get_dim(); + + if (output_tensor_dim.empty()) { + output_tensor_dim.resize(3); + output_tensor_dim[0] = 1; + output_tensor_dim[2] = weight_tensor_dim[2]; + if (attributes.mode == MoeGroupedMatmulMode_t::GATHER) { + output_tensor_dim[1] = token_index_tensor->get_dim()[1]; + } else { + output_tensor_dim[1] = token_tensor_dim[1]; + } + output_tensor_dim.resize(3); + + output_tensor->set_dim(output_tensor_dim); + } + + if (output_tensor->get_stride().empty()) { + auto const& output_dim = output_tensor->get_dim(); + auto const& stride_order = detail::generate_row_major_stride_order(output_dim.size()); + output_tensor->set_stride(detail::generate_stride(output_dim, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + getLogger() << "[cudnn_frontend] INFO: " + << "Building MoeGroupedMatmulNode operations " << attributes.name << std::endl; + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Moe grouped matmul requires cuDNN v9.15.0"}; + +#if (CUDNN_VERSION >= 
91500) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(91500, cudnn_ver_error); + CUDNN_FRONTEND_UNUSED(operations); + + auto moe_grouped_matmul_operation = + make_shared_backend_pointer(CUDNN_BACKEND_OPERATION_MOE_GROUPED_MATMUL_DESCRIPTOR); + + cudnnMoeGroupedMatmulMode_t moe_grouped_matmul_mode; + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.mode, moe_grouped_matmul_mode)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_MODE, + CUDNN_TYPE_MOE_GROUPED_MATMUL_MODE, + 1, + &moe_grouped_matmul_mode)); + + cudnnDataType_t cudnn_data_type; + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + auto token = attributes.inputs.find(Moe_grouped_matmul_attributes::input_names::Token)->second; + auto backend_token = tensors[token->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_TOKEN_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_token)); + + auto weight = attributes.inputs.find(Moe_grouped_matmul_attributes::input_names::Weight)->second; + auto backend_weight = tensors[weight->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_WEIGHT_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_weight)); + + auto first_token_offset = + attributes.inputs.find(Moe_grouped_matmul_attributes::input_names::FirstTokenOffset)->second; + auto backend_first_token_offset = 
tensors[first_token_offset->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_FIRST_TOKEN_OFFSET_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_first_token_offset)); + + auto output = attributes.outputs.find(Moe_grouped_matmul_attributes::output_names::Output)->second; + auto backend_output = tensors[output->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_OUTPUT_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_output)); + + if (attributes.mode == MoeGroupedMatmulMode_t::GATHER || attributes.mode == MoeGroupedMatmulMode_t::SCATTER) { + auto token_index = attributes.inputs.find(Moe_grouped_matmul_attributes::input_names::TokenIndex)->second; + auto backend_token_index = tensors[token_index->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_TOKEN_INDEX_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_token_index)); + } + + if (attributes.mode == MoeGroupedMatmulMode_t::SCATTER) { + auto token_ks = attributes.inputs.find(Moe_grouped_matmul_attributes::input_names::TokenKs)->second; + auto backend_token_ks = tensors[token_ks->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_TOKEN_KS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_token_ks)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(moe_grouped_matmul_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MOE_GROUPED_MATMUL_TOP_K, + CUDNN_TYPE_INT32, + 1, + &(attributes.top_k))); + } + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::finalize(moe_grouped_matmul_operation->get_backend_descriptor())); + + raw_operations.push_back(moe_grouped_matmul_operation); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "MOE_GROUPED_MATMUL"})"_json); + } +#endif +}; + +inline void +INode::moe_grouped_matmul(std::shared_ptr token, + std::shared_ptr weight, + std::shared_ptr first_token_offset, + std::shared_ptr token_index, + std::shared_ptr token_ks, + Moe_grouped_matmul_attributes attributes, + std::shared_ptr output) { + attributes.inputs[Moe_grouped_matmul_attributes::input_names::Token] = token; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::Weight] = weight; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::FirstTokenOffset] = first_token_offset; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::TokenIndex] = token_index; + attributes.inputs[Moe_grouped_matmul_attributes::input_names::TokenKs] = token_ks; + attributes.outputs[Moe_grouped_matmul_attributes::output_names::Output] = output; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/paged_cache_load.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/paged_cache_load.h new file mode 100644 index 00000000..9fc86109 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/paged_cache_load.h @@ -0,0 +1,153 
@@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +#include "pointwise.h" +#include "reduction.h" + +namespace cudnn_frontend::graph { + +class PagedCacheLoadNode : public NodeCRTP { + public: + PagedCacheLoad_attributes attributes; + + PagedCacheLoadNode(PagedCacheLoad_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::PAGED_CACHE_LOAD; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building PagedCacheLoadNode operations " << attributes.name << " "); + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Paged cache load requires cuDNN v9.5.0"}; + +#if (CUDNN_VERSION >= 90500) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(90500, cudnn_ver_error); + + // Create operation by directly calling cuDNN backend API + Operation_v8 paged_cache_load_operation; + + _CUDNN_CHECK_CUDNN_ERROR(paged_cache_load_operation.initialize_managed_backend_pointer( + CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR)); + + // Set container tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(container, PagedCacheLoad_attributes::input_names::container); + auto container_desc = tensors.at(container->second->get_uid())->get_raw_desc(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(paged_cache_load_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &container_desc)); + + // Set page table tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(pageTable, PagedCacheLoad_attributes::input_names::pageTable); + auto page_table_desc = tensors.at(pageTable->second->get_uid())->get_raw_desc(); + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(paged_cache_load_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &page_table_desc)); + + // Set sequence length tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(seqLen, PagedCacheLoad_attributes::input_names::seqLen); + auto seq_len_desc = tensors.at(seqLen->second->get_uid())->get_raw_desc(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(paged_cache_load_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &seq_len_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(yOut, PagedCacheLoad_attributes::output_names::yOut); + auto y_desc = tensors.at(yOut->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(paged_cache_load_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(paged_cache_load_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(paged_cache_load_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating PagedCacheLoadNode " << attributes.name); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90500 || detail::get_compiled_version() < 90500, + error_code_t::CUDNN_BACKEND_API_FAILED, + "The cuDNN backend version must be at least 9.5.0 at compile time and runtime " + "in order to use PagedCacheLoadNode."); + + auto const yOut_dims = 
attributes.outputs.at(PagedCacheLoad_attributes::output_names::yOut)->get_dim(); + auto const yOut_strides = attributes.outputs.at(PagedCacheLoad_attributes::output_names::yOut)->get_stride(); + auto const container_dims = attributes.inputs.at(PagedCacheLoad_attributes::input_names::container)->get_dim(); + auto const blockTable_dims = attributes.inputs.at(PagedCacheLoad_attributes::input_names::pageTable)->get_dim(); + + // In the backend, the k-cache is passed as K^T and has dims [B,H,D,S], while v-cache has dims [B,H,S,D] + // Use the strides to distinguish. + auto yIsTransposed = yOut_strides[2] == 1; + auto s_kv = !yIsTransposed ? yOut_dims[2] : yOut_dims[3]; + + auto block_size = container_dims[2]; + auto block_table_size = blockTable_dims[2]; + bool is_block_table_packed = + attributes.inputs.at(PagedCacheLoad_attributes::input_names::pageTable)->get_ragged_offset() != nullptr; + + RETURN_CUDNN_FRONTEND_ERROR_IF( + !is_block_table_packed && (s_kv + (block_size - 1)) / block_size != block_table_size, + error_code_t::INVALID_VALUE, + "Paged cache load: block table size must equal ceil(s_kv/block_size), except when using packed block " + "tables"); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + } +#endif +}; + +inline void +INode::paged_cache_load(std::shared_ptr container, + std::shared_ptr seqLen, + std::shared_ptr pageTable, + PagedCacheLoad_attributes attributes, + std::shared_ptr yOut) { + attributes.inputs[PagedCacheLoad_attributes::input_names::container] = std::move(container); + attributes.inputs[PagedCacheLoad_attributes::input_names::seqLen] = std::move(seqLen); + attributes.inputs[PagedCacheLoad_attributes::input_names::pageTable] = std::move(pageTable); + attributes.outputs[PagedCacheLoad_attributes::output_names::yOut] = std::move(yOut); + 
sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/pointwise.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/pointwise.h new file mode 100644 index 00000000..d67ab6a3 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/pointwise.h @@ -0,0 +1,377 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class PointwiseNode : public NodeCRTP { + public: + Pointwise_attributes attributes; + + PointwiseNode(Pointwise_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::POINTWISE; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for pointwise node " << attributes.name); + + attributes.fill_from_context(context); + + auto out_0_tensor = attributes.outputs.at(Pointwise_attributes::output_names::OUT_0); + + auto output_dim = out_0_tensor->get_dim(); + // Only infer dims and strides if user did not set them + if (output_dim.empty()) { + std::vector> input_shapes; + for (const auto& [input_name, input_tensor] : attributes.inputs) { + if (!input_tensor) { + continue; + } + input_shapes.push_back(input_tensor->get_dim()); + } + + CHECK_CUDNN_FRONTEND_ERROR(detail::compute_broadcast_shape(input_shapes, output_dim)); + out_0_tensor->set_dim(output_dim); + } + + if (out_0_tensor->get_stride().empty()) { + for (const auto& [input_name, input_tensor] : attributes.inputs) { + if (input_tensor == nullptr) { + continue; + } + if (input_tensor->get_dim() == out_0_tensor->get_dim()) { + CUDNN_FE_LOG_LABEL_ENDL("INFO:" << " " << out_0_tensor->get_name() + << " stride computed from " << input_tensor->get_name()); + 
out_0_tensor->set_stride(input_tensor->get_stride()); + break; + } + } + if (out_0_tensor->get_stride().empty() && out_0_tensor->get_is_virtual()) { + // If the tensor is virtual the strides are immaterial + auto input_stride = attributes.inputs.at(Pointwise_attributes::input_names::IN_0)->get_stride(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(input_stride, output_dim.size(), stride_order)); + out_0_tensor->set_stride(detail::generate_stride(output_dim, stride_order)); + } + RETURN_CUDNN_FRONTEND_ERROR_IF(out_0_tensor->get_stride().empty(), + error_code_t::SHAPE_DEDUCTION_FAILED, + "Pointwise output strides could not be computed"); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building PointwiseNode operations " << attributes.name << " "); + + // Create pointwise descriptor by directly calling cuDNN backend API + PointWiseDesc_v8 pointwise_descriptor; + + _CUDNN_CHECK_CUDNN_ERROR( + pointwise_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_POINTWISE_DESCRIPTOR)); + + // Set pointwise mode + cudnnPointwiseMode_t cudnn_pointwise_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.mode, cudnn_pointwise_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_MODE, + CUDNN_TYPE_POINTWISE_MODE, + 1, + &cudnn_pointwise_mode)); + + // Set compute type + cudnnDataType_t cudnn_data_type; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_MATH_PREC, + 
CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + // Set mode-specific attributes + if (attributes.mode == PointwiseMode_t::RELU_FWD || attributes.mode == PointwiseMode_t::RELU_BWD) { + cudnnNanPropagation_t nan_propagation = CUDNN_PROPAGATE_NAN; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_NAN_PROPAGATION, + CUDNN_TYPE_NAN_PROPOGATION, + 1, + &nan_propagation)); + + double lower_clip = attributes.relu_lower_clip.value_or(0.0); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP, + CUDNN_TYPE_DOUBLE, + 1, + &lower_clip)); + + double upper_clip = attributes.relu_upper_clip.value_or(std::numeric_limits::max()); + if (attributes.compute_data_type == DataType_t::FLOAT) { + upper_clip = std::min(upper_clip, std::numeric_limits::max()); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP, + CUDNN_TYPE_DOUBLE, + 1, + &upper_clip)); + + double lower_clip_slope = attributes.relu_lower_clip_slope.value_or(0.0); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE, + CUDNN_TYPE_DOUBLE, + 1, + &lower_clip_slope)); + } else if (attributes.mode == PointwiseMode_t::ELU_FWD || attributes.mode == PointwiseMode_t::ELU_BWD) { + double elu_alpha = attributes.elu_alpha.value_or(1.0); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + pointwise_descriptor.get_raw_desc(), CUDNN_ATTR_POINTWISE_ELU_ALPHA, CUDNN_TYPE_DOUBLE, 1, &elu_alpha)); + } else if (attributes.mode == PointwiseMode_t::SOFTPLUS_FWD || + attributes.mode == PointwiseMode_t::SOFTPLUS_BWD) { + double softplus_beta = attributes.softplus_beta.value_or(1.0); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA, + CUDNN_TYPE_DOUBLE, + 1, + &softplus_beta)); 
+ } else if (attributes.mode == PointwiseMode_t::SWISH_FWD || attributes.mode == PointwiseMode_t::SWISH_BWD) { + double swish_beta = attributes.swish_beta.value_or(1.0); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_descriptor.get_raw_desc(), + CUDNN_ATTR_POINTWISE_SWISH_BETA, + CUDNN_TYPE_DOUBLE, + 1, + &swish_beta)); + } else if (attributes.mode == PointwiseMode_t::GEN_INDEX) { + int64_t axis = attributes.get_axis().value_or(-1); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + pointwise_descriptor.get_raw_desc(), CUDNN_ATTR_POINTWISE_AXIS, CUDNN_TYPE_INT64, 1, &axis)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(pointwise_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(pointwise_descriptor); + + // Create operation by directly calling cuDNN backend API + Operation_v8 pointwise_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + pointwise_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)); + + // Set the pointwise descriptor + auto pw_desc_ptr = pointwise_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &pw_desc_ptr)); + + auto const port_count = get_pointwise_mode_port_count(attributes.mode); + bool const is_activation_bwd = detail::is_activation_backward_mode(attributes.mode); + + if (is_activation_bwd) { + // Backward mode: IN_0 is dy, IN_1 is x, OUT_0 is dx + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); + auto dy_desc = tensors.at(IN_0->second->get_uid())->get_raw_desc(); + + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); + auto x_desc = tensors.at(IN_1->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + 
&x_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); + auto dx_desc = tensors.at(OUT_0->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dx_desc)); + } else { + // Forward mode + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_0, Pointwise_attributes::input_names::IN_0); + auto x_desc = tensors.at(IN_0->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(OUT_0, Pointwise_attributes::output_names::OUT_0); + auto y_desc = tensors.at(OUT_0->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + if (port_count >= 3) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_1, Pointwise_attributes::input_names::IN_1); + auto b_desc = tensors.at(IN_1->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_BDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &b_desc)); + } + + if (port_count >= 4) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(IN_2, Pointwise_attributes::input_names::IN_2); + auto t_desc = tensors.at(IN_2->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(pointwise_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_POINTWISE_TDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &t_desc)); + } + } + 
+ // Set alpha scaling factors (always set to 1.0) + float alpha1 = 1.0f; + float alpha2 = 1.0f; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + pointwise_operation.get_raw_desc(), CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1, CUDNN_TYPE_FLOAT, 1, &alpha1)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + pointwise_operation.get_raw_desc(), CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2, CUDNN_TYPE_FLOAT, 1, &alpha2)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(pointwise_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(pointwise_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "POINTWISE"})"_json); + } +#endif +}; + +inline void +INode::pointwise(std::shared_ptr a, + Pointwise_attributes attributes, + std::shared_ptr c) { + attributes.inputs[Pointwise_attributes::input_names::IN_0] = a; + attributes.outputs[Pointwise_attributes::output_names::OUT_0] = c; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +inline void +INode::pointwise(std::shared_ptr a, + std::shared_ptr b, + Pointwise_attributes attributes, + std::shared_ptr c) { + attributes.inputs[Pointwise_attributes::input_names::IN_0] = a; + attributes.inputs[Pointwise_attributes::input_names::IN_1] = b; + attributes.outputs[Pointwise_attributes::output_names::OUT_0] = c; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +inline std::shared_ptr +INode::pointwise(std::shared_ptr a, Pointwise_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + attributes.inputs[Pointwise_attributes::input_names::IN_0] = a; + if (a->get_name().empty()) { + 
a->set_name(attributes.name + "::IN_0"); + }; + auto OUT_0 = attributes.outputs[Pointwise_attributes::output_names::OUT_0] = + output_tensor(attributes.name + "::OUT_0"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return OUT_0; +} + +inline std::shared_ptr +INode::pointwise(std::shared_ptr a, + std::shared_ptr b, + Pointwise_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + attributes.inputs[Pointwise_attributes::input_names::IN_0] = a; + attributes.inputs[Pointwise_attributes::input_names::IN_1] = b; + if (a->get_name().empty()) { + a->set_name(attributes.name + "::IN_0"); + }; + if (b->get_name().empty()) { + b->set_name(attributes.name + "::IN_1"); + }; + auto OUT_0 = attributes.outputs[Pointwise_attributes::output_names::OUT_0] = + output_tensor(attributes.name + "::OUT_0"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return OUT_0; +} + +inline std::shared_ptr +INode::pointwise(std::shared_ptr a, + std::shared_ptr b, + std::shared_ptr c, + Pointwise_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + attributes.inputs[Pointwise_attributes::input_names::IN_0] = a; + attributes.inputs[Pointwise_attributes::input_names::IN_1] = b; + attributes.inputs[Pointwise_attributes::input_names::IN_2] = c; + if (a->get_name().empty()) { + a->set_name(attributes.name + "::IN_0"); + }; + if (b->get_name().empty()) { + b->set_name(attributes.name + "::IN_1"); + }; + if (c->get_name().empty()) { + c->set_name(attributes.name + "::IN_2"); + }; + auto OUT_0 = attributes.outputs[Pointwise_attributes::output_names::OUT_0] = + output_tensor(attributes.name + "::OUT_0"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return OUT_0; +} +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git 
a/third_party/cudnn-frontend/include/cudnn_frontend/node/reduction.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/reduction.h new file mode 100644 index 00000000..193b9d37 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/reduction.h @@ -0,0 +1,189 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class ReductionNode : public NodeCRTP { + public: + Reduction_attributes attributes; + + ReductionNode(Reduction_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::REDUCTION; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating ReductionNode " << attributes.name); + + if (attributes.get_is_deterministic() && detail::get_backend_version() < 91100) { + return {error_code_t::GRAPH_NOT_SUPPORTED, "DETERMINISTIC mode is not supported in cudnn version < 9.11.0"}; + } + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for reduction node " << attributes.name); + + attributes.fill_from_context(context); + + // Only inferrencing from IN_0 to OUT_0 works today. 
+ auto x_tensor = attributes.inputs[Reduction_attributes::input_names::X]; + auto y_tensor = attributes.outputs[Reduction_attributes::output_names::Y]; + + auto const& x_tensor_dim = x_tensor->get_dim(); + auto y_tensor_dim = y_tensor->get_dim(); + // Only infer dims and strides if user did not set them + if (y_tensor_dim.empty()) { + y_tensor->set_dim(x_tensor_dim); + } + if (y_tensor->get_stride().empty()) { + auto const& y_dim = y_tensor->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(y_dim.size()); + y_tensor->set_stride(detail::generate_stride(y_dim, stride_order)); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building ReductionNode operations " << attributes.name << " "); + + // Create reduction descriptor by directly calling cuDNN backend API + ReductionDesc_v8 reduction_descriptor; + + // 1. Create the backend descriptor + + _CUDNN_CHECK_CUDNN_ERROR( + reduction_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_REDUCTION_DESCRIPTOR)); + + // 2. Set compute type attribute + cudnnDataType_t cudnn_data_type; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reduction_descriptor.get_raw_desc(), + CUDNN_ATTR_REDUCTION_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + // 3. 
Set reduction operator attribute + cudnnReduceTensorOp_t cudnn_reduction_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.get_mode().value(), cudnn_reduction_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reduction_descriptor.get_raw_desc(), + CUDNN_ATTR_REDUCTION_OPERATOR, + CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, + 1, + &cudnn_reduction_mode)); + + // 4. Set deterministic mode if supported +#if (CUDNN_VERSION >= 91100) + if (detail::get_backend_version() >= 91100) { + bool is_deterministic = attributes.get_is_deterministic(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reduction_descriptor.get_raw_desc(), + CUDNN_ATTR_REDUCTION_IS_DETERMINISTIC, + CUDNN_TYPE_BOOLEAN, + 1, + &is_deterministic)); + } +#endif + + // 5. Finalize the descriptor + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(reduction_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(reduction_descriptor); + + // Create operation by directly calling cuDNN backend API + Operation_v8 reduction_operation; + + // Validate input tensors are set + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reduction_attributes::input_names::X); + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reduction_attributes::output_names::Y); + + // 1. Create the backend operation descriptor + + _CUDNN_CHECK_CUDNN_ERROR( + reduction_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR)); + + // 2. Set the reduction descriptor attribute + auto reduction_desc_ptr = reduction_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reduction_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_REDUCTION_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &reduction_desc_ptr)); + + // 3. 
Set the input tensor (X) descriptor attribute + auto x_backend_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reduction_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_REDUCTION_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_backend_desc)); + + // 4. Set the output tensor (Y) descriptor attribute + auto y_backend_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reduction_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_REDUCTION_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_backend_desc)); + + // 5. Finalize the operation descriptor + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(reduction_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(reduction_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "REDUCTION"})"_json); + } +#endif +}; + +inline void +INode::reduction(std::shared_ptr a, + Reduction_attributes attributes, + std::shared_ptr c) { + attributes.inputs[Reduction_attributes::input_names::X] = a; + attributes.outputs[Reduction_attributes::output_names::Y] = c; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +inline std::shared_ptr +INode::reduction(std::shared_ptr input, Reduction_attributes attributes) { + attributes.inputs[Reduction_attributes::input_names::X] = input; + auto Y = attributes.outputs[Reduction_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return Y; +} +} // namespace cudnn_frontend::graph diff --git 
a/third_party/cudnn-frontend/include/cudnn_frontend/node/resample.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/resample.h new file mode 100644 index 00000000..34e6031f --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/resample.h @@ -0,0 +1,291 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class ResampleNode : public NodeCRTP { + public: + Resample_attributes attributes; + + ResampleNode(Resample_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::RESAMPLE; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: " << "Validating ResampleNode " << attributes.name); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.generate_index.has_value() == false, + error_code_t::ATTRIBUTE_NOT_SET, + "generate_index attribute not set"); + + if (attributes.generate_index.value() == true && attributes.resample_mode == ResampleMode_t::MAXPOOL) { + CUDNN_FE_VALIDATE_OUTPUT_TENSOR(Resample_attributes::output_names::Index); + } + + // Make sure that the mode can be lowered to BE + cudnnResampleMode_t dummy; + RETURN_CUDNN_FRONTEND_ERROR_IF( + detail::convert_to_cudnn_type(attributes.resample_mode, dummy) != CUDNN_STATUS_SUCCESS, + error_code_t::ATTRIBUTE_NOT_SET, + "Invalid resample mode."); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for resample node " << attributes.name); + + auto y_tensor = attributes.outputs[Resample_attributes::output_names::Y]; + auto x_tensor = attributes.inputs[Resample_attributes::input_names::X]; + + attributes.fill_from_context(context); + + // If user does not set shape and layout of the output tensor, + // Get it from node attributes + if (y_tensor->get_dim().empty()) { + auto const 
x_dim = x_tensor->get_dim(); + auto y_dim = y_tensor->get_dim(); + y_dim = x_dim; + + // 2 cause first two dimensions are batch and channels + for (auto dim = 2u; dim < x_dim.size(); ++dim) { + auto spatial_dim = dim - 2u; + y_dim[dim] = + 1 + (x_dim[dim] + attributes.pre_padding[spatial_dim].numerator + + attributes.post_padding[spatial_dim].numerator - attributes.window[spatial_dim].numerator) / + attributes.stride[spatial_dim].numerator; + } + + y_tensor->set_dim(y_dim); + } + + // If layout is not set, generate the strides from layout + if (y_tensor->get_stride().empty()) { + auto const& y_dim = y_tensor->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(y_dim.size()); + y_tensor->set_stride(detail::generate_stride(y_dim, stride_order)); + } + + if (attributes.outputs[Resample_attributes::output_names::Index]) { + auto index_tensor = attributes.outputs[Resample_attributes::output_names::Index]; + index_tensor->set_dim(y_tensor->get_dim()); + + // If layout is not set, generate the strides from layout + if (index_tensor->get_stride().empty()) { + auto const& index_dim = index_tensor->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(index_dim.size()); + index_tensor->set_stride(detail::generate_stride(index_dim, stride_order)); + } + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building ResampleNode operations " << attributes.name << " "); + + auto number_of_spatial_dim = static_cast(attributes.window.size()); + + // Create resample descriptor by directly calling cuDNN backend API + ResampleDesc_v8 resample_descriptor; + + _CUDNN_CHECK_CUDNN_ERROR( + 
resample_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_RESAMPLE_DESCRIPTOR)); + + // Set resample mode + cudnnResampleMode_t cudnn_resample_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.resample_mode, cudnn_resample_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_MODE, + CUDNN_TYPE_RESAMPLE_MODE, + 1, + &cudnn_resample_mode)); + + // Set compute type + cudnnDataType_t cudnn_data_type; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.compute_data_type, cudnn_data_type)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type)); + + // Set nan propagation + cudnnNanPropagation_t nan_opt = CUDNN_PROPAGATE_NAN; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION, + CUDNN_TYPE_NAN_PROPOGATION, + 1, + &nan_opt)); + + // Set padding mode + cudnnPaddingMode_t cudnn_padding_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.padding_mode, cudnn_padding_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_PADDING_MODE, + CUDNN_TYPE_PADDING_MODE, + 1, + &cudnn_padding_mode)); + + // Set spatial dimensions + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS, + CUDNN_TYPE_INT64, + 1, + &number_of_spatial_dim)); + + // Set window dimensions + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_WINDOW_DIMS, + CUDNN_TYPE_FRACTION, + number_of_spatial_dim, + attributes.window.data())); + + // Set pre padding + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_PRE_PADDINGS, + CUDNN_TYPE_FRACTION, + 
number_of_spatial_dim, + attributes.pre_padding.data())); + + // Set post padding + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_POST_PADDINGS, + CUDNN_TYPE_FRACTION, + number_of_spatial_dim, + attributes.post_padding.data())); + + // Set strides + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_descriptor.get_raw_desc(), + CUDNN_ATTR_RESAMPLE_STRIDES, + CUDNN_TYPE_FRACTION, + number_of_spatial_dim, + attributes.stride.data())); + + // Finalize the descriptor + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(resample_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(resample_descriptor); + + // Create operation by directly calling cuDNN backend API + Operation_v8 resample_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + resample_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Resample_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Resample_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + // Set alpha and beta + double alpha = 1.0; + double beta = 0.0; + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + resample_operation.get_raw_desc(), CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA, CUDNN_TYPE_DOUBLE, 1, &alpha)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + resample_operation.get_raw_desc(), CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA, CUDNN_TYPE_DOUBLE, 
1, &beta)); + + // Set resample descriptor + auto resample_raw_desc = resample_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &resample_raw_desc)); + + // Set index tensor if available + auto index = attributes.outputs.find(Resample_attributes::output_names::Index); + if ((index != attributes.outputs.end()) && (index->second != nullptr)) { + auto idx_desc = tensors.at(index->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(resample_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &idx_desc)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(resample_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(resample_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "RESAMPLE"})"_json); + } +#endif +}; + +inline std::array, 2> +INode::resample(std::shared_ptr input, Resample_attributes attributes) { + if (attributes.name.empty()) { + attributes.name += std::to_string(sub_nodes.size()); + } + attributes.inputs[Resample_attributes::input_names::X] = input; + auto Y = attributes.outputs[Resample_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + std::shared_ptr Index = nullptr; + if (attributes.generate_index.has_value() && attributes.generate_index.value() == true && + attributes.resample_mode == ResampleMode_t::MAXPOOL) { + Index = attributes.outputs[Resample_attributes::output_names::Index] = + output_tensor(attributes.name + "::Index"); + } + + 
sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return {Y, Index}; +} + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/reshape.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/reshape.h new file mode 100644 index 00000000..39b08c79 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/reshape.h @@ -0,0 +1,116 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class ReshapeNode : public NodeCRTP { + public: + Reshape_attributes attributes; + + ReshapeNode(Reshape_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::RESHAPE; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for reshape node " << attributes.name); + + auto y_tensor = attributes.outputs[Reshape_attributes::output_names::Y]; + + attributes.fill_from_context(context); + + // If user does not set shape and layout of the output tensor, + // Get it from node attributes + // If layout is not set, generate the strides from layout + + if (y_tensor->get_dim().empty() && attributes.get_dim().size()) { + y_tensor->set_dim(attributes.dim); + } + + if (y_tensor->get_stride().empty()) { + if (attributes.get_stride().size()) { + y_tensor->set_stride(attributes.get_stride()); + } else { + auto const& y_dim = y_tensor->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(y_dim.size()); + y_tensor->set_stride(detail::generate_stride(y_dim, stride_order)); + } + } + + if (y_tensor->get_dim().empty() || y_tensor->get_stride().empty()) { + return {error_code_t::SHAPE_DEDUCTION_FAILED, "Reshape node output shape deduction failed"}; + } + + return {error_code_t::OK, ""}; + } + + error_t + 
create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building ReshapeNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 reshape_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + reshape_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Reshape_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reshape_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RESHAPE_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Reshape_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(reshape_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RESHAPE_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(reshape_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(reshape_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "RESHAPE"})"_json); + } +#endif +}; + +inline std::shared_ptr +INode::reshape(std::shared_ptr input, Reshape_attributes attributes) { + attributes.inputs[Reshape_attributes::input_names::X] = input; + auto Y = 
attributes.outputs[Reshape_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return Y; +} + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/rmsnorm.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/rmsnorm.h new file mode 100644 index 00000000..bc1f37d2 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/rmsnorm.h @@ -0,0 +1,406 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend { + +namespace graph { +class RMSNormNode : public NodeCRTP { + public: + Rmsnorm_attributes attributes; + + RMSNormNode(Rmsnorm_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::RMSNORM; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing properties for rmsnorm node " << attributes.name); + + attributes.fill_from_context(context); + + auto X = attributes.inputs[Rmsnorm_attributes::input_names::X]; + auto Y = attributes.outputs[Rmsnorm_attributes::output_names::Y]; + + // Only infer dims and strides if user did not set them + if (Y->get_dim().empty()) { + Y->set_dim(X->get_dim()); + } + if (Y->get_stride().empty()) { + Y->set_stride(X->get_stride()); + } + + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + auto inv_var = attributes.outputs[Rmsnorm_attributes::output_names::INV_VARIANCE]; + // Only infer dims and strides if user did not set them + if (inv_var->get_dim().empty()) { + auto inv_var_dim = X->get_dim(); + auto scale = attributes.inputs[Rmsnorm_attributes::input_names::SCALE]; + if (scale->get_dim().empty()) { + // mean inv_var dim is n,1,1,1 + for (size_t i = 1; i < inv_var_dim.size(); i++) { + inv_var_dim[i] = 1; + } + } 
else { + for (size_t i = 0; i < inv_var_dim.size(); i++) { + if (scale->get_dim()[i] != 1) { + inv_var_dim[i] = 1; + } + } + } + inv_var->set_dim(inv_var_dim); + } + if (inv_var->get_stride().empty()) { + auto const& inv_var_dim = inv_var->get_dim(); + std::vector stride_order; + CHECK_CUDNN_FRONTEND_ERROR( + detail::generate_stride_order_preserving_format(X->get_stride(), inv_var_dim.size(), stride_order)); + inv_var->set_stride(detail::generate_stride(inv_var_dim, stride_order)); + } + } + return {error_code_t::OK, ""}; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating RMSNormNode " << attributes.name); + + // Norm forward phase should be set + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.forward_phase == NormFwdPhase_t::NOT_SET, + error_code_t::ATTRIBUTE_NOT_SET, + "Forward phase not set of rmsnorm node."); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building RMSNormNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 rmsnorm_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + rmsnorm_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR)); + + // Set norm mode to RMS_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::RMS_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set forward phase + cudnnBackendNormFwdPhase_t cudnn_norm_fwd_phase; + + 
_CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.forward_phase, cudnn_norm_fwd_phase)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, + CUDNN_TYPE_NORM_FWD_PHASE, + 1, + &cudnn_norm_fwd_phase)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set scale tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_attributes::input_names::SCALE); + auto scale_desc = tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + // Set epsilon tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Rmsnorm_attributes::input_names::EPSILON); + auto epsilon_desc = tensors.at(EPSILON->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &epsilon_desc)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rmsnorm_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &y_desc)); + + // Set inv_variance for training phase + if (attributes.forward_phase == NormFwdPhase_t::TRAINING) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(INV_VARIANCE, Rmsnorm_attributes::output_names::INV_VARIANCE); + auto inv_var_desc = 
tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + } + + // Set optional bias tensor + auto BIAS = attributes.inputs.find(Rmsnorm_attributes::input_names::BIAS); + if ((BIAS != attributes.inputs.end()) && (BIAS->second != nullptr)) { + auto bias_desc = tensors.at(BIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &bias_desc)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(rmsnorm_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(rmsnorm_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "RMS_NORM"})"_json); + } +#endif +}; + +class DRMSNormNode : public NodeCRTP { + public: + Rmsnorm_backward_attributes attributes; + + DRMSNormNode(Rmsnorm_backward_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::DRMSNorm; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating DRMSNormNode node " << attributes.name); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.use_dbias.has_value() == false, + error_code_t::ATTRIBUTE_NOT_SET, + "DRMSNormNode node needs has_bias(bool) to be called."); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferencing 
properties for DRMSNorm node " << attributes.name); + + attributes.fill_from_context(context); + + // TODO: Only inferencing from X works today. + auto X = attributes.inputs[Rmsnorm_backward_attributes::input_names::X]; + auto const x_tensor_dim = X->get_dim(); + + auto DY = attributes.inputs[Rmsnorm_backward_attributes::input_names::DY]; + auto dy_tensor_dim = DY->get_dim(); + + // Only infer dims and strides if user did not set them + if (dy_tensor_dim.empty()) { + dy_tensor_dim.resize(x_tensor_dim.size()); + DY->set_dim(x_tensor_dim); + } + if (DY->get_stride().empty()) { + auto const& DY_dim = DY->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DY_dim.size()); + DY->set_stride(detail::generate_stride(DY_dim, stride_order)); + } + + auto DX = attributes.outputs[Rmsnorm_backward_attributes::output_names::DX]; + auto dx_tensor_dim = DX->get_dim(); + // Only infer dims and strides if user did not set them + if (dx_tensor_dim.empty()) { + dx_tensor_dim.resize(x_tensor_dim.size()); + DX->set_dim(x_tensor_dim); + } + if (DX->get_stride().empty()) { + auto const& DX_dim = DX->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(DX_dim.size()); + DX->set_stride(detail::generate_stride(DX_dim, stride_order)); + } + + auto scale_bias_dim = X->get_dim(); + scale_bias_dim[0] = 1; + + // Set channel length tensors + auto infer_scale_bias_tensors = [&scale_bias_dim](std::shared_ptr& T) { + auto tensor_dim = T->get_dim(); + // Only infer dims and strides if user did not set them + if (tensor_dim.empty()) { + T->set_dim(scale_bias_dim); + } + if (T->get_stride().empty()) { + auto const& T_dim = T->get_dim(); + // Default to NHWC + auto const& stride_order = detail::generate_NHWC_stride_order(T_dim.size()); + T->set_stride(detail::generate_stride(T_dim, stride_order)); + } + }; + + infer_scale_bias_tensors(attributes.outputs[Rmsnorm_backward_attributes::output_names::DSCALE]); + if 
(attributes.use_dbias.value()) { + infer_scale_bias_tensors(attributes.outputs[Rmsnorm_backward_attributes::output_names::DBIAS]); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: Building DRMSNormNode operations " << attributes.name << " "); + + // Create operation by directly calling cuDNN backend API + Operation_v8 drmsnorm_operation; + + _CUDNN_CHECK_CUDNN_ERROR( + drmsnorm_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR)); + + // Set norm mode to RMS_NORM + cudnnBackendNormMode_t cudnn_norm_mode; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(NormMode_t::RMS_NORM, cudnn_norm_mode)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode)); + + // Set input tensor X + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(X, Rmsnorm_backward_attributes::input_names::X); + auto x_desc = tensors.at(X->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &x_desc)); + + // Set DY tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(DY, Rmsnorm_backward_attributes::input_names::DY); + auto dy_desc = tensors.at(DY->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dy_desc)); + + // Set scale tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(SCALE, Rmsnorm_backward_attributes::input_names::SCALE); + auto scale_desc = 
tensors.at(SCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &scale_desc)); + + // Set inv_variance tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(INV_VARIANCE, Rmsnorm_backward_attributes::input_names::INV_VARIANCE); + auto inv_var_desc = tensors.at(INV_VARIANCE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &inv_var_desc)); + + // Set DSCALE output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DSCALE, Rmsnorm_backward_attributes::output_names::DSCALE); + auto dscale_desc = tensors.at(DSCALE->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dscale_desc)); + + // Set optional DBIAS output tensor + if (attributes.use_dbias.value()) { + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DBIAS, Rmsnorm_backward_attributes::output_names::DBIAS); + auto dbias_desc = tensors.at(DBIAS->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dbias_desc)); + } + + // Set DX output tensor + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Rmsnorm_backward_attributes::output_names::DX); + auto dx_desc = tensors.at(DX->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(drmsnorm_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &dx_desc)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(drmsnorm_operation.get_raw_desc())); + + 
operations.push_back(std::make_shared(std::move(drmsnorm_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "RMS_NORM_BPROP"})"_json); + } +#endif +}; + +} // namespace graph + +} // namespace cudnn_frontend \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/rng.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/rng.h new file mode 100644 index 00000000..09b762d3 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/rng.h @@ -0,0 +1,187 @@ +#pragma once + +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +class RngNode : public NodeCRTP { + public: + Rng_attributes attributes; + + RngNode(Rng_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::RNG; + } + + error_t + infer_properties_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for rng node " << attributes.name); + + auto y_tensor = attributes.outputs[Rng_attributes::output_names::Y]; + + attributes.fill_from_context(context); + + // If user does not set shape and layout of the generated tensor, + // Get it from node attributes + // If layout is not set, generate the strides from layout + + if (y_tensor->get_dim().empty() && attributes.get_dim().size()) { + y_tensor->set_dim(attributes.dim); + } + + if (y_tensor->get_stride().empty()) { + if (attributes.get_stride().size()) { + y_tensor->set_stride(attributes.get_stride()); + } else { + auto const& y_dim = y_tensor->get_dim(); + // Default to NHWC + auto const& stride_order = 
detail::generate_NHWC_stride_order(y_dim.size()); + y_tensor->set_stride(detail::generate_stride(y_dim, stride_order)); + } + } + + if (y_tensor->get_dim().empty() || y_tensor->get_stride().empty()) { + return {error_code_t::SHAPE_DEDUCTION_FAILED, "RNG node output shape deduction failed"}; + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building RngNode operations " << attributes.name << " "); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.get_distribution() != RngDistribution_t::BERNOULLI, + error_code_t::ATTRIBUTE_NOT_SET, + "no other distribution except bernoulli supported."); + + // Create RNG descriptor by directly calling cuDNN backend API + RngDesc_v8 rng_descriptor; + + _CUDNN_CHECK_CUDNN_ERROR(rng_descriptor.initialize_managed_backend_pointer(CUDNN_BACKEND_RNG_DESCRIPTOR)); + + // Set distribution type + cudnnRngDistribution_t cudnn_rng_distribution; + + _CUDNN_CHECK_CUDNN_ERROR(detail::convert_to_cudnn_type(attributes.get_distribution(), cudnn_rng_distribution)); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rng_descriptor.get_raw_desc(), + CUDNN_ATTR_RNG_DISTRIBUTION, + CUDNN_TYPE_RNG_DISTRIBUTION, + 1, + &cudnn_rng_distribution)); + + // Set Bernoulli distribution probability + double bernoulli_prob = attributes.get_bernoulli_probability().value(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rng_descriptor.get_raw_desc(), + CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY, + CUDNN_TYPE_DOUBLE, + 1, + &bernoulli_prob)); + + // Finalize the descriptor + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(rng_descriptor.get_raw_desc())); + CUDNN_FE_LOG_LABEL_ENDL(rng_descriptor); + + // Create operation by directly calling cuDNN backend API + Operation_v8 rng_operation; + + 
_CUDNN_CHECK_CUDNN_ERROR( + rng_operation.initialize_managed_backend_pointer(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR)); + + // Set output tensor Y + CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(Y, Rng_attributes::output_names::Y); + auto y_desc = tensors.at(Y->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + rng_operation.get_raw_desc(), CUDNN_ATTR_OPERATION_RNG_YDESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &y_desc)); + + // Set RNG descriptor + auto rng_raw_desc = rng_descriptor.get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rng_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RNG_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &rng_raw_desc)); + + if (attributes.seed.has_value()) { + // Set seed as int64_t value + int64_t seed_value = attributes.get_seed().value(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute( + rng_operation.get_raw_desc(), CUDNN_ATTR_OPERATION_RNG_SEED, CUDNN_TYPE_INT64, 1, &seed_value)); + } else { + // Set seed tensor descriptor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Seed, Rng_attributes::input_names::Seed); + auto seed_desc = tensors.at(Seed->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rng_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RNG_SEED, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &seed_desc)); + + // Set offset tensor descriptor + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Offset, Rng_attributes::input_names::Offset); + auto offset_desc = tensors.at(Offset->second->get_uid())->get_raw_desc(); + + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(rng_operation.get_raw_desc(), + CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &offset_desc)); + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(rng_operation.get_raw_desc())); + + operations.push_back(std::make_shared(std::move(rng_operation))); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + 
uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "RNG"})"_json); + } +#endif +}; + +inline void +INode::rng(std::shared_ptr seed, + std::shared_ptr offset, + Rng_attributes attributes, + std::shared_ptr y) { + attributes.inputs[Rng_attributes::input_names::Seed] = seed; + attributes.inputs[Rng_attributes::input_names::Offset] = offset; + attributes.outputs[Rng_attributes::output_names::Y] = y; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +inline std::shared_ptr +INode::rng(std::shared_ptr seed, + std::shared_ptr offset, + Rng_attributes attributes) { + attributes.inputs[Rng_attributes::input_names::Seed] = seed; + attributes.inputs[Rng_attributes::input_names::Offset] = offset; + auto Y = attributes.outputs[Rng_attributes::output_names::Y] = output_tensor(attributes.name + "::Y"); + + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); + return Y; +} +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h new file mode 100644 index 00000000..55d635a4 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h @@ -0,0 +1,1964 @@ +#pragma once + +#include + +#include "../../cudnn_frontend_Heuristics.h" +#include "../../cudnn_frontend_Logging.h" + +#include "../graph_helpers.h" +#include "../node_interface.h" + +#include "matmul.h" +#include "pointwise.h" +#include "rng.h" +#include "softmax.h" +#include "paged_cache_load.h" +#include "sdpa_support_surface.h" + +namespace cudnn_frontend::graph { + +namespace attn::score_modifiers { + +// 
clang-format off +inline float get_negative_inf_value(); + +inline std::shared_ptr causal_mask( + std::shared_ptr graph, + std::shared_ptr attention_score +); + +inline std::shared_ptr bias( + std::shared_ptr graph, + std::shared_ptr attention_score, + std::shared_ptr bias_tensor +); + +inline std::shared_ptr causal_mask_bottom_right( + std::shared_ptr graph, + std::shared_ptr attention_score, + std::shared_ptr seq_len_q, + std::shared_ptr seq_len_kv +); + +inline std::shared_ptr padding_mask( + std::shared_ptr graph, + std::shared_ptr attention_score, + std::shared_ptr seq_len_kv, + std::shared_ptr seq_len_q +); + +inline std::shared_ptr sliding_window_mask( + std::shared_ptr graph, + std::shared_ptr attention_score, + DiagonalAlignment_t diagonal_alignment, + std::optional left_window, + std::optional right_window, + int64_t s_q, + int64_t s_kv, + std::shared_ptr s_q_ptr, + std::shared_ptr s_kv_ptr +); + +inline std::shared_ptr alibi_mask( + std::shared_ptr graph, + std::shared_ptr attention_score, + std::shared_ptr& alibi_slopes, + int64_t h_q, + int64_t& alibi_slopes_size +); +// clang-format on + +} // namespace attn::score_modifiers + +template +class SDPANodeBase : public NodeCRTP { + protected: + using input_names = SDPA_attributes::input_names; + using output_names = SDPA_attributes::output_names; + + std::shared_ptr rng_output; + std::shared_ptr alibi_slopes; + int64_t alibi_slopes_size = 0; + + public: + SDPA_attributes attributes; + + SDPANodeBase(SDPA_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + bool + is_paged_v() const { + auto page_table_v_it = attributes.inputs.find(input_names::Page_table_V); + return ((page_table_v_it) != attributes.inputs.end() && page_table_v_it->second != nullptr); + } + + bool + is_paged_k() const { + auto page_table_k_it = attributes.inputs.find(input_names::Page_table_K); + return ((page_table_k_it) != attributes.inputs.end() && 
page_table_k_it->second != nullptr); + } + + bool + has_seq_len_q() const { + auto seq_len_Q_it = attributes.inputs.find(SDPA_attributes::input_names::SEQ_LEN_Q); + return ((seq_len_Q_it) != attributes.inputs.end() && seq_len_Q_it->second != nullptr); + } + + bool + has_seq_len_kv() const { + auto seq_len_KV_it = attributes.inputs.find(SDPA_attributes::input_names::SEQ_LEN_KV); + return ((seq_len_KV_it) != attributes.inputs.end() && seq_len_KV_it->second != nullptr); + } + + // Helper function to infer KV sequence length + // Note that it cannot be run as part of infer_properties_node as + // this is being used in pre_validate_node + int64_t + infer_s_kv() const { + int64_t s_kv = -1; + + auto get_input_dim = [this](const SDPA_attributes::input_names& input_name) { + auto const input_it = attributes.inputs.find(input_name); + if (input_it != attributes.inputs.end()) { + return input_it->second->get_dim(); + } else { + return std::vector({-1, -1, -1, -1}); + } + }; + + auto const& k_dim = get_input_dim(input_names::K); + auto const& v_dim = get_input_dim(input_names::V); + + // If s_kv was set explicitly, use that + if (attributes.max_seq_len_kv.has_value()) { + s_kv = attributes.max_seq_len_kv.value(); + } + // When one of K or V cache are paged, s_kv can be extracted directly + else if (!is_paged_k()) { + s_kv = k_dim[2]; + + } else if (!is_paged_v()) { + s_kv = v_dim[2]; + } else { + CUDNN_FE_LOG_LABEL_ENDL( + "WARNING: maximum kv sequence length is being inferred. 
To set it explicitly, please use " + "\"set_paged_attention_max_seq_len_kv\""); + + auto bias_it = attributes.inputs.find(input_names::Bias); + auto rng_it = attributes.outputs.find(output_names::RNG_DUMP); + + // If there is a bias, extract it from there + if (bias_it != attributes.inputs.end() && bias_it->second != nullptr) { + s_kv = get_input_dim(input_names::Bias)[3]; + // If there is an rng_dump output, extract it from there + } else if (rng_it != attributes.outputs.end() && rng_it->second != nullptr) { + s_kv = rng_it->second->get_dim()[3]; + // When both caches are paged, and the above failed, we need to infer s_kv from the page table and + // container + } else { + // [b, 1, ceil(s_kv/block_size), 1] + auto page_table_dim_k = get_input_dim(input_names::Page_table_K); + // [b, h_k, block_size, d_k] + auto const container_dim_k = get_input_dim(input_names::K); + int64_t s_k = page_table_dim_k[2] * container_dim_k[2]; + + // [b, 1, ceil(s_kv/block_size), 1] + auto page_table_dim_v = get_input_dim(input_names::Page_table_V); + // [b, h_v, block_size, d_v] + auto const container_dim_v = get_input_dim(input_names::V); + int64_t s_v = page_table_dim_v[2] * container_dim_v[2]; + + s_kv = std::min(s_k, s_v); + } + } + + return s_kv; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating SDPANode " << attributes.name); + + // check that Q, K, V, O tensors has been assigned + // check that dim and strides has been assigned and last stride is 1 +#define CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(port, port_map) \ + { \ + std::shared_ptr tensor_ptr = port_map.at(port); \ + RETURN_CUDNN_FRONTEND_ERROR_IF(tensor_ptr->get_dim().size() != 4, \ + error_code_t::ATTRIBUTE_NOT_SET, \ + "The dim for " + std::string(#port) + " is invalid"); \ + RETURN_CUDNN_FRONTEND_ERROR_IF(tensor_ptr->get_stride().size() != 4, \ + error_code_t::ATTRIBUTE_NOT_SET, \ + "The stride for " + std::string(#port) + " is invalid"); \ + 
RETURN_CUDNN_FRONTEND_ERROR_IF( \ + tensor_ptr->get_stride()[3] != 1, \ + error_code_t::GRAPH_NOT_SUPPORTED, \ + "The stride for the last dimension corresponding to the embedding size per head should be 1 for " + \ + std::string(#port)); \ + } + + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::Q, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::K, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::V, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(output_names::O, attributes.outputs); + + if (attributes.generate_stats.value_or(false) == true) { + CUDNN_FE_VALIDATE_OUTPUT_TENSOR(output_names::Stats); + } + + // If max is requested, validate that the output tensor is present + if (attributes.outputs.find(output_names::Max) != attributes.outputs.end() && + attributes.outputs.at(output_names::Max) != nullptr) { + CUDNN_FE_VALIDATE_OUTPUT_TENSOR(output_names::Max); + } + + // If sum_exp is requested, validate that the output tensor is present + if (attributes.outputs.find(output_names::Sum_exp) != attributes.outputs.end() && + attributes.outputs.at(output_names::Sum_exp) != nullptr) { + CUDNN_FE_VALIDATE_OUTPUT_TENSOR(output_names::Sum_exp); + } + +#undef CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE + + // validate backend limitations for the operation + auto validation_result = + attributes.validate_sdpa_support_surface(this->context, infer_s_kv(), is_paged_k(), is_paged_v()); + if (validation_result.is_good() == false) { + return validation_result; + } + + // return NOT_SET if sink_token present with 9.12 and below + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 91300 && + attributes.inputs.find(input_names::SINK_TOKEN) != attributes.inputs.end(), + error_code_t::ATTRIBUTE_NOT_SET, + "SDPA with sink_token is not supported before 9.13."); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + if (attributes.generate_stats.value_or(false)) { + auto stats = 
attributes.outputs.at(output_names::Stats); + auto stats_dim = stats->get_dim(); + + if (stats_dim.empty()) { + // Fill properties of virtual tensors + auto const& p_dim = attributes.inputs[input_names::Q]->get_dim(); + auto b = p_dim[0]; + auto h = p_dim[1]; + auto s_q = p_dim[2]; + stats->set_dim({b, h, s_q, 1}).set_stride({h * s_q, s_q, 1, 1}); + } + } + + if (attributes.outputs[output_names::Max] != nullptr) { + auto max = attributes.outputs.at(output_names::Max); + + if (max->get_dim().empty()) { + // Fill properties of virtual tensors + auto const& p_dim = attributes.inputs[input_names::Q]->get_dim(); + auto b = p_dim[0]; + auto h = p_dim[1]; + auto s_q = p_dim[2]; + max->set_dim({b, h, s_q, 1}).set_stride({h * s_q, s_q, 1, 1}); + } + } + + if (attributes.outputs[output_names::Sum_exp] != nullptr) { + auto sum_exp = attributes.outputs.at(output_names::Sum_exp); + + if (sum_exp->get_dim().empty()) { + // Fill properties of virtual tensors + auto const& p_dim = attributes.inputs[input_names::Q]->get_dim(); + auto b = p_dim[0]; + auto h = p_dim[1]; + auto s_q = p_dim[2]; + sum_exp->set_dim({b, h, s_q, 1}).set_stride({h * s_q, s_q, 1, 1}); + } + } + return {error_code_t::OK, ""}; + } + + error_t + post_validate_node() const override final { +#define CUDNN_FE_VALIDATE_STRIDE(port, port_map) \ + { \ + auto const& t = port_map.find(port); \ + RETURN_CUDNN_FRONTEND_ERROR_IF( \ + t->second->get_stride().back() != 1, \ + error_code_t::GRAPH_NOT_SUPPORTED, \ + "The stride for the last dimension corresponding to the embedding size per head should be 1 for " + \ + std::string(#port)); \ + } + + CUDNN_FE_VALIDATE_STRIDE(output_names::O, attributes.outputs); + +#undef CUDNN_FE_VALIDATE_STRIDE + + return {error_code_t::OK, ""}; + } + + virtual int64_t + get_fe_workspace_size_node() const override final { + int64_t size = 0; + + // align alibi slopes memory to 16 bytes + size += ((alibi_slopes_size + 15) / 16 * 16); + + return size; + } + + virtual error_t + 
collect_tensors_in_workspace_node( + std::unordered_map>>& + workspace_modifications, + int64_t& offset) const override final { + if (attributes.alibi_mask) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); + int64_t const h_q = Q->second->get_dim()[1]; + auto alibi_slopes_vec = detail::get_alibi_slope(h_q); + workspace_modifications.emplace(alibi_slopes->get_uid(), std::make_tuple(0, offset, alibi_slopes_vec)); + int64_t alibi_slopes_size_padded = ((alibi_slopes_size + 15) / 16 * 16); + offset = offset + alibi_slopes_size_padded; + } + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "SDPA_FWD"})"_json); + } +#endif +}; + +class CompositeSDPANode : public SDPANodeBase { + public: + CompositeSDPANode(SDPA_attributes&& attributes_, detail::Context const& context) + : SDPANodeBase(std::move(attributes_), context) {} + + Type + getType() override final { + return Type::COMPOSITE; + } + + error_t + expand_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for CompositeSDPANode node " << attributes.name); + + // DO NOT REMOVE + // input data type is needed for: + // - aType of bmm2 + // - dropout scale in pre 8.9.3 + attributes.fill_from_context(this->context); + + // Gather dim to fill properties of virtual tensors + auto const& q_dim = attributes.inputs[input_names::Q]->get_dim(); + auto b = q_dim[0]; + auto h_q = q_dim[1]; + auto s_q = q_dim[2]; + auto d_qk = q_dim[3]; + auto const& k_dim = attributes.inputs[input_names::K]->get_dim(); + auto h_k = k_dim[1]; + auto const& v_dim = attributes.inputs[input_names::V]->get_dim(); + auto h_v = v_dim[1]; + auto d_v = v_dim[3]; + // Infer s_kv + int64_t s_kv = infer_s_kv(); + + std::shared_ptr k_cache; + if (!is_paged_k()) { + // 1. 
map K->KT + // cuDNN frontend API attention requires Q, K, V where + // Q = {b, h_q, s_q, d_qk} + // K = {b, h_k, s_kv, d_qk} + // V = {b, h_v, s_kv, d_v} + // but cuDNN backend API attention requires Q, KT, V + // Q = {b, h_q, s_q, d_qk} + // KT = {b, h_k, d_qk, s_kv} + // V = {b, h_v, s_kv, d_v} + // So the code below maps the K->KT + std::vector temp_vec; + + temp_vec = attributes.inputs[input_names::K]->get_dim(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::K]->set_dim(temp_vec); + + temp_vec = attributes.inputs[input_names::K]->get_stride(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::K]->set_stride(temp_vec); + + // 2. Set k_cache + k_cache = attributes.inputs[input_names::K]; + } else { + // Create a paged cache load operation + auto paged_cache_load_attributes_k = PagedCacheLoad_attributes().set_name("paged_k_cache_operation"); + // Need to create virtual tensor descriptor for yOut here as it cannot be inferred + // K-cache has BHDS layout + k_cache = std::make_shared(); + k_cache->set_is_virtual(true); + k_cache->set_dim({b, h_k, d_qk, s_kv}); + k_cache->set_stride({d_qk * s_kv * h_k, d_qk * s_kv, 1, d_qk}); + k_cache->set_data_type(attributes.inputs[input_names::K]->get_data_type()); + paged_cache_load(attributes.inputs[input_names::K], + attributes.inputs[input_names::SEQ_LEN_KV], + attributes.inputs[input_names::Page_table_K], + paged_cache_load_attributes_k, + k_cache); + } + + // This tensor tracks the main chain of data flow + std::shared_ptr last_output; + + //// Q * K + auto bmm1_attributes = Matmul_attributes() + .set_name("bmm1") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_n_override(attributes.inputs[input_names::SEQ_LEN_KV]); + + if (attributes.padding_mask) { + bmm1_attributes.set_padding(0.0); + } + + auto const& bmm1_output = matmul(attributes.inputs[input_names::Q], k_cache, bmm1_attributes); + // Setting dim and strides as pointwise op wont have knowledge of 
how to do it for mha. + bmm1_output->set_dim({b, h_q, s_q, s_kv}).set_stride({h_q * s_q * s_kv, s_q * s_kv, s_kv, 1}); + last_output = bmm1_output; + + //// Optional Attn scale + // In case user provided a scalar value, do a fused scalar. + if (attributes.attn_scale_value.has_value()) { + attributes.inputs[input_names::Attn_scale] = + std::make_shared(attributes.attn_scale_value.value()); + } + + // If attn scale present, add a pointwise mul node + if (attributes.inputs[input_names::Attn_scale]) { + Pointwise_attributes scale_attributes; + scale_attributes.set_name("attn_scale").set_mode(PointwiseMode_t::MUL); + auto const& attn_scale_output = + pointwise(last_output, attributes.inputs[input_names::Attn_scale], scale_attributes); + last_output = attn_scale_output; + } + + // Descale Q + if (attributes.inputs.find(input_names::Descale_Q) != attributes.inputs.end() && + attributes.inputs.at(input_names::Descale_Q) != nullptr) { + auto descale_q_attributes = Pointwise_attributes().set_mode(PointwiseMode_t::MUL).set_name("descale_q"); + last_output = pointwise(last_output, attributes.inputs.at(input_names::Descale_Q), descale_q_attributes); + } + + // Descale K + if (attributes.inputs.find(input_names::Descale_K) != attributes.inputs.end() && + attributes.inputs.at(input_names::Descale_K) != nullptr) { + auto descale_k_attributes = Pointwise_attributes().set_mode(PointwiseMode_t::MUL).set_name("descale_k"); + last_output = pointwise(last_output, attributes.inputs.at(input_names::Descale_K), descale_k_attributes); + } + + if (attributes.attention_score_modifier != nullptr) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = this->context; + last_output = attributes.attention_score_modifier(graph_, last_output); + sub_nodes.emplace_back(node_); + } + + // Optional bias + if (attributes.inputs.find(input_names::Bias) != attributes.inputs.end() && + attributes.inputs[input_names::Bias]) { + auto graph_ = 
std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = this->context; + last_output = attn::score_modifiers::bias(graph_, last_output, attributes.inputs[input_names::Bias]); + sub_nodes.emplace_back(node_); + } + + if (attributes.alibi_mask) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = this->context; + last_output = attn::score_modifiers::alibi_mask(graph_, last_output, alibi_slopes, h_q, alibi_slopes_size); + sub_nodes.emplace_back(node_); + } + + // There are two cases of applying padding mask + // 1. when actual seq_len is less than or equal to max_seq_len + if (attributes.padding_mask) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = this->context; + last_output = attn::score_modifiers::padding_mask(graph_, + last_output, + attributes.inputs[input_names::SEQ_LEN_KV], + attributes.inputs[input_names::SEQ_LEN_Q]); + sub_nodes.emplace_back(node_); + } + + // 2. (bug in cudnn backend) no padding with max_seq_len%64!=0 + if ((s_kv % 64 != 0) && (!(attributes.padding_mask)) && (detail::get_backend_version() < 90000)) { + auto col_index_attributes = + Pointwise_attributes().set_name("gen_col_index").set_mode(PointwiseMode_t::GEN_INDEX).set_axis(3); + auto col_index_output = pointwise(last_output, col_index_attributes); + // scalar seq_kv only needs to be passed in case there in no padding mask and seq_kv is not multiple of 64. + // Also future versions of cudnn will not need it, hence tensor is pre-fixed with WAR. 
+ auto WAR_scalar_max_seq_kv = std::make_shared(static_cast(s_kv)); + + auto col_less_seq_kv_attributes = + Pointwise_attributes().set_name("col_less_seq_kv").set_mode(PointwiseMode_t::CMP_LT); + auto col_less_seq_kv_output = + pointwise(col_index_output, WAR_scalar_max_seq_kv, col_less_seq_kv_attributes); + + // Lower attributes to binary select attributes + auto negative_inf_padding = + std::make_shared(attn::score_modifiers::get_negative_inf_value()); + auto binary_select_attributes = + Pointwise_attributes().set_name("binary_select").set_mode(PointwiseMode_t::BINARY_SELECT); + auto padding_mask_output = + pointwise(last_output, negative_inf_padding, col_less_seq_kv_output, binary_select_attributes); + last_output = padding_mask_output; + } + + // Apply (bottom-right) causal masking (with right bound) and/or set the left bound + if (attributes.left_bound.has_value() || attributes.right_bound.has_value()) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = this->context; + + auto s_kv_ptr = attributes.inputs.find(input_names::SEQ_LEN_KV) != attributes.inputs.end() + ? attributes.inputs[input_names::SEQ_LEN_KV] + : nullptr; + auto s_q_ptr = attributes.inputs.find(input_names::SEQ_LEN_Q) != attributes.inputs.end() + ? 
attributes.inputs[input_names::SEQ_LEN_Q] + : nullptr; + + last_output = attn::score_modifiers::sliding_window_mask(graph_, + last_output, + attributes.diagonal_alignment, + attributes.left_bound, + attributes.right_bound, + s_q, + s_kv, + s_q_ptr, + s_kv_ptr); + sub_nodes.emplace_back(node_); + } + + // Lower attributes to softmax attributes + auto softmax_output = std::make_shared(); + softmax_output->set_is_virtual(true); + + auto softmax_attributes = Softmax_attributes().set_name("softmax"); + // Set sink for softmax if user has provided a sink tensor + if (attributes.inputs.find(input_names::SINK_TOKEN) != attributes.inputs.end()) { + softmax_attributes.set_sink(attributes.inputs[input_names::SINK_TOKEN]); + } + // Special non-functional-style call. Needed because output already created and provided to user. + softmax(last_output, + softmax_attributes, + softmax_output, + attributes.outputs[output_names::Stats], + attributes.outputs[output_names::Max], + attributes.outputs[output_names::Sum_exp]); + last_output = softmax_output; + + // Two cases for training: dropout present or not + bool dropout_present = false; + auto const& dropout_mask = attributes.inputs.find(input_names::Dropout_mask); + bool const is_dropout_custom = (dropout_mask != attributes.inputs.end()) && (dropout_mask->second != nullptr); + if (attributes.dropout_probability.has_value()) { + dropout_present = true; + // Special case: Skip dropout when 0.0 probability. Only do for 8.9.3 and up as rng isn't optional earlier. 
+ if (detail::get_backend_version() > 8902 && attributes.dropout_probability.value() == 0.0) { + dropout_present = false; + } + } else if (is_dropout_custom) { + dropout_present = true; + } + + if (dropout_present) { + if (is_dropout_custom) { + auto dropout_scale_attributes = + Pointwise_attributes().set_name("dropout_scale_mul").set_mode(PointwiseMode_t::MUL); + auto const& dropout_scale_output = + pointwise(last_output, attributes.inputs[input_names::Dropout_scale], dropout_scale_attributes); + + auto mask_attributes = + Pointwise_attributes().set_name("dropout_mask_mul").set_mode(PointwiseMode_t::MUL); + auto const& dropout_mask_output = + pointwise(dropout_scale_output, dropout_mask->second, mask_attributes); + last_output = dropout_mask_output; + } else { + if (attributes.outputs[output_names::RNG_DUMP] != nullptr) { + rng_output = attributes.outputs[output_names::RNG_DUMP]; + rng(attributes.inputs[input_names::Seed], + attributes.inputs[input_names::Offset], + Rng_attributes() + .set_name("rng") + .set_distribution(RngDistribution_t::BERNOULLI) + .set_bernoulli_probability(1.0 - attributes.dropout_probability.value()), + rng_output); + } else { + rng_output = rng(attributes.inputs[input_names::Seed], + attributes.inputs[input_names::Offset], + Rng_attributes() + .set_name("rng") + .set_distribution(RngDistribution_t::BERNOULLI) + .set_bernoulli_probability(1.0 - attributes.dropout_probability.value())); + rng_output + // Hard coding dim and strides as rng output can no inputs to infer it from. 
+ ->set_dim({b, h_q, s_q, s_kv}) + .set_stride({h_q * s_q * s_kv, s_q * s_kv, s_kv, 1}); + } + + auto mask_attributes = + Pointwise_attributes().set_name("dropout_mask_mul").set_mode(PointwiseMode_t::MUL); + auto const& dropout_mask_output = pointwise(last_output, rng_output, mask_attributes); + last_output = dropout_mask_output; + + std::shared_ptr dropout_scale = nullptr; + + if (detail::get_backend_version() < 8903) { + half dropout_scale_value = __float2half(1.0f / (1.0f - attributes.dropout_probability.value())); + dropout_scale = std::make_shared(dropout_scale_value); + } else { + float dropout_scale_value = (1.0f / (1.0f - attributes.dropout_probability.value())); + dropout_scale = std::make_shared(dropout_scale_value); + } + + auto dropout_scale_attributes = + Pointwise_attributes().set_name("dropout_scale").set_mode(PointwiseMode_t::MUL); + auto const& dropout_scale_output = pointwise(last_output, dropout_scale, dropout_scale_attributes); + last_output = dropout_scale_output; + } + } + + // Amax S + if (attributes.outputs.find(output_names::Amax_S) != attributes.outputs.end() && + attributes.outputs.at(output_names::Amax_S) != nullptr) { + auto amax_attributes = Reduction_attributes().set_name("amax_s").set_mode(ReductionMode_t::AMAX); + // Special non-functional-style call. Needed because output already created and provided to user. + reduction(last_output, amax_attributes, attributes.outputs.at(output_names::Amax_S)); + } + + // Scale S + if (attributes.inputs.find(input_names::Scale_S) != attributes.inputs.end() && + attributes.inputs.at(input_names::Scale_S) != nullptr) { + auto scale_s_attributes = Pointwise_attributes().set_name("scale_s").set_mode(PointwiseMode_t::MUL); + last_output = pointwise(last_output, attributes.inputs.at(input_names::Scale_S), scale_s_attributes); + } + + // Lower attributes to bmm2 attributes + // Requirement by cudnn backend to take in bmm2 aType as i/o type. 
+ last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); + + auto const& seq_len_q = attributes.inputs[input_names::SEQ_LEN_Q]; + auto const& seq_len_kv = attributes.inputs[input_names::SEQ_LEN_KV]; + // auto const& V = attributes.inputs[input_names::V]; + auto const& O = attributes.outputs[output_names::O]; + + std::shared_ptr v_cache; + + if (!is_paged_v()) { + v_cache = attributes.inputs[input_names::V]; + } else { + auto paged_cache_load_attributes_v = PagedCacheLoad_attributes().set_name("paged_v_cache_operation"); + v_cache = std::make_shared(); + v_cache->set_dim({b, h_v, s_kv, d_v}) + .set_stride({d_v * s_kv * h_v, d_v * s_kv, d_v, 1}) + .set_data_type(attributes.inputs[input_names::V]->get_data_type()); + v_cache->set_is_virtual(true); + paged_cache_load(attributes.inputs[input_names::V], + attributes.inputs[input_names::SEQ_LEN_KV], + attributes.inputs[input_names::Page_table_V], + paged_cache_load_attributes_v, + v_cache); + } + + //// S * V + if (attributes.mma_core_mode == DataType_t::HALF) { + auto bmm2_attributes = + Matmul_attributes().set_name("bmm2").set_m_override(seq_len_q).set_k_override(seq_len_kv); + // Special non-functional-style call. Needed because output already created and provided to user. + matmul(last_output, v_cache, bmm2_attributes, O); + } else if (attributes.mma_core_mode == DataType_t::FP8_E4M3 || + attributes.mma_core_mode == DataType_t::FP8_E5M2) { + auto const& descale_s = attributes.inputs.at(input_names::Descale_S); + auto const& descale_v = attributes.inputs.at(input_names::Descale_V); + auto const& scale_o = attributes.inputs.at(input_names::Scale_O); + auto const& amax_o = attributes.outputs.at(output_names::Amax_O); + + auto bmm2_attributes = + Matmul_fp8_attributes().set_name("bmm2").set_m_override(seq_len_q).set_k_override(seq_len_kv); + // Special non-functional-style call. Needed because output already created and provided to user. 
+ matmul_fp8(last_output, v_cache, descale_s, descale_v, scale_o, bmm2_attributes, O, amax_o); + } else { + RETURN_CUDNN_FRONTEND_ERROR_IF(true, error_code_t::GRAPH_NOT_SUPPORTED, "Unsupported MMA core mode"); + } + + return {error_code_t::OK, ""}; + } +}; + +class CompositeSDPABackwardNode : public NodeCRTP { + using input_names = SDPA_backward_attributes::input_names; + using output_names = SDPA_backward_attributes::output_names; + + private: + // non-virtual node gpu tensors + std::shared_ptr dQ_accum; + int64_t dQ_accum_size = 0; + std::shared_ptr dK_fullhead; + int64_t dK_fullhead_size = 0; + std::shared_ptr dV_fullhead; + int64_t dV_fullhead_size = 0; + std::shared_ptr softmax_sum; + int64_t softmax_sum_size = 0; + std::shared_ptr alibi_slopes; + int64_t alibi_slopes_size = 0; + + mutable bool has_workaround_padding_mask = false; // Will be edited in pre_validate_node() + mutable int32_t s_q_for_workaround_padding_mask = 0; // Will be edited in pre_validate_node() + mutable int32_t s_kv_for_workaround_padding_mask = 0; // Will be edited in pre_validate_node() + mutable std::shared_ptr + workaround_padding_mask_seq_len_q; // Will be edited in pre_validate_node() + mutable std::shared_ptr + workaround_padding_mask_seq_len_kv; // Will be edited in pre_validate_node() + mutable int64_t batch_size_for_workaround_padding_mask = 0; // Will be edited in pre_validate_node() + mutable bool is_deterministic_algorithm_supported_on_blackwell = false; // Will be edited in pre_validate_node() + + public: + mutable SDPA_backward_attributes attributes; // Will be edited in pre_validate_node() for workaround padding mask + + CompositeSDPABackwardNode(SDPA_backward_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::COMPOSITE; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating CompositeSDPABackwardNode" << 
attributes.name); + + // check that Q, K, V, O, stats, dO, dQ, dK, dV tensors has been assigned + // check that dim and strides has been assigned and last stride is 1 +#define CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(port, port_map) \ + { \ + std::shared_ptr tensor_ptr = port_map.at(port); \ + RETURN_CUDNN_FRONTEND_ERROR_IF(tensor_ptr->get_dim().size() != 4, \ + error_code_t::ATTRIBUTE_NOT_SET, \ + "The dim for " + std::string(#port) + " is invalid"); \ + RETURN_CUDNN_FRONTEND_ERROR_IF(tensor_ptr->get_stride().size() != 4, \ + error_code_t::ATTRIBUTE_NOT_SET, \ + "The stride for " + std::string(#port) + " is invalid"); \ + RETURN_CUDNN_FRONTEND_ERROR_IF( \ + tensor_ptr->get_stride()[3] != 1, \ + error_code_t::GRAPH_NOT_SUPPORTED, \ + "The stride for the last dimension corresponding to the embedding size per head should be 1 for " + \ + std::string(#port)); \ + } + + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::Q, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::K, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::V, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::O, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::Stats, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::dO, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(output_names::dQ, attributes.outputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(output_names::dK, attributes.outputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(output_names::dV, attributes.outputs); + +#undef CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE + + // validate backend limitations for the operation + // clang-format off + int64_t s_q = attributes.inputs.at(input_names::Q)->get_dim()[2]; + int64_t s_kv = attributes.inputs.at(input_names::V)->get_dim()[2]; + int64_t h_q = attributes.inputs.at(input_names::Q)->get_dim()[1]; + int64_t h_k = attributes.inputs.at(input_names::K)->get_dim()[1]; + int64_t h_v = attributes.inputs.at(input_names::V)->get_dim()[1]; + 
int64_t d_qk = attributes.inputs.at(input_names::Q)->get_dim()[3]; + int64_t d_v = attributes.inputs.at(input_names::V)->get_dim()[3]; + + bool const is_ragged = attributes.inputs.at(input_names::Q)->get_ragged_offset() || + attributes.inputs.at(input_names::K)->get_ragged_offset() || + attributes.inputs.at(input_names::V)->get_ragged_offset() || + attributes.inputs.at(input_names::O)->get_ragged_offset(); + + auto const& bias_mask = attributes.inputs.find(input_names::Bias); + bool const is_bias = (bias_mask != attributes.inputs.end() && bias_mask->second != nullptr); + auto const& dbias_mask = attributes.outputs.find(output_names::dBias); + bool const is_dbias = (dbias_mask != attributes.outputs.end() && dbias_mask->second != nullptr); + + auto const& dropout_mask = attributes.inputs.find(input_names::Dropout_mask); + bool const is_dropout_custom = (dropout_mask != attributes.inputs.end()) && (dropout_mask->second != nullptr); + bool const is_dropout = attributes.dropout_probability.has_value() || is_dropout_custom; + + auto const& rng_tensor = attributes.outputs.find(output_names::RNG_DUMP); + bool const is_rng = (rng_tensor != attributes.outputs.end() && rng_tensor->second != nullptr); + + // validation TODO: + // - validate stats has valid dims + // - validate Q and dQ have the same dims + + // Stop s_q = S_kv = 1 from running + RETURN_CUDNN_FRONTEND_ERROR_IF(s_q == 1 && s_kv == 1, + error_code_t::GRAPH_NOT_SUPPORTED, + "s_q = s_kv = 1 is not supported."); + + cudaDeviceProp prop; + int device; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device(&device)); + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device_properties(&prop, device)); + + if (prop.major == 9) { + // validate basic dimension requirements + + if ((detail::get_backend_version() >= 91100) && (detail::get_backend_version() < 91300)) { + + if ((128 < d_qk) && (d_qk <= 192) && (64 < d_v) && (d_v <= 128)) { + + // DeepSeek case, 9.11 only supports 192 hidden dim + RETURN_CUDNN_FRONTEND_ERROR_IF( (d_v != 
128) && (d_qk != 192), + error_code_t::GRAPH_NOT_SUPPORTED, + "Num hidden_dim d_v should be equal to 128 if d_qk is 192"); + } + } + + RETURN_CUDNN_FRONTEND_ERROR_IF((d_qk > 256) || (d_qk % 8 != 0) || (d_v > 256) || (d_v % 8 != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "Num hidden_dim should be less than or equal to 256 and hidden_dim should be multiple of 8"); + + } else if (prop.major == 10 && detail::get_backend_version() >= 91100) { + // validate basic dimension requirements + if (d_qk == 192) { // special case for 192 hidden dim + RETURN_CUDNN_FRONTEND_ERROR_IF( (d_v != 128), + error_code_t::GRAPH_NOT_SUPPORTED, + "Num hidden_dim d_v should be equal to 128 if d_qk is 192"); + } else { + RETURN_CUDNN_FRONTEND_ERROR_IF((d_qk > 128) || (d_qk % 8 != 0) || (d_v > 128) || (d_v % 8 != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "Num hidden_dim should be less than or equal to 128 and hidden_dim should be multiple of 8 when d_qk != d_v"); + } + } else { + // validate basic dimension requirements + RETURN_CUDNN_FRONTEND_ERROR_IF((d_qk > 128) || (d_qk % 8 != 0) || (d_v > 128) || (d_v % 8 != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "Num hidden_dim should be less than or equal to 128 and hidden_dim should be multiple of 8"); + } + + RETURN_CUDNN_FRONTEND_ERROR_IF((attributes.attention_score_modifier != nullptr) && + (attributes.alibi_mask || attributes.padding_mask || attributes.has_causal_like_masking() || + attributes.left_bound.has_value()), error_code_t::GRAPH_NOT_SUPPORTED,"Attention score mod enabled and hence other subgraphs are disabled."); + + RETURN_CUDNN_FRONTEND_ERROR_IF((h_q % h_k != 0) || (h_q % h_v != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "For group-query attention, number of heads for key and query must be a factor of number of heads for query"); + + // validate options for attn_scale + auto const& attn_scale = attributes.inputs.find(input_names::Attn_scale); + bool const has_attn_scale = (attn_scale != attributes.inputs.end()) && (attn_scale->second 
!= nullptr); + RETURN_CUDNN_FRONTEND_ERROR_IF(has_attn_scale && attributes.attn_scale_value.has_value(), + error_code_t::ATTRIBUTE_NOT_SET, + "attn_scale with tensor and value cannot be set at the same time."); + + // validate alibi requirements + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.alibi_mask && !(attributes.right_bound.has_value() && attributes.right_bound.value() == 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "When alibi mask is used, diagonal_band_right_bound needs to be set to 0."); + + // validate options for bias mask + RETURN_CUDNN_FRONTEND_ERROR_IF(is_bias && (bias_mask->second->get_data_type() == DataType_t::BOOLEAN), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bias mask data type cannot be boolean"); + + if (s_kv % 128 != 0 && attributes.padding_mask == false && is_ragged == false && detail::get_backend_version() <= 91500) { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Workaround padding mask is enabled for s_q % 128 != 0 and use_padding_mask == false and is_ragged == false"); + has_workaround_padding_mask = true; + batch_size_for_workaround_padding_mask = attributes.inputs.at(input_names::Q)->get_dim()[0]; + s_q_for_workaround_padding_mask = s_q; + s_kv_for_workaround_padding_mask = s_kv; + workaround_padding_mask_seq_len_q = std::make_shared(); + workaround_padding_mask_seq_len_q->set_name("workaround_padding_mask_seq_len_q").set_dim({batch_size_for_workaround_padding_mask,1,1,1}).set_stride({1,1,1,1}).set_data_type(DataType_t::INT32); + workaround_padding_mask_seq_len_kv = std::make_shared(); + workaround_padding_mask_seq_len_kv->set_name("workaround_padding_mask_seq_len_kv").set_dim({batch_size_for_workaround_padding_mask,1,1,1}).set_stride({1,1,1,1}).set_data_type(DataType_t::INT32); + attributes.set_padding_mask(true); + attributes.set_seq_len_q(workaround_padding_mask_seq_len_q).set_seq_len_kv(workaround_padding_mask_seq_len_kv); + } + + // validate options for padding mask + auto const& seq_len_q = attributes.inputs.find(input_names::SEQ_LEN_Q); + bool 
const has_seq_len_q = (seq_len_q != attributes.inputs.end()) && (seq_len_q->second != nullptr); + auto const& seq_len_kv = attributes.inputs.find(input_names::SEQ_LEN_KV); + bool const has_seq_len_kv = (seq_len_kv != attributes.inputs.end()) && (seq_len_kv->second != nullptr); + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.padding_mask && (!has_seq_len_q || !has_seq_len_kv), + error_code_t::ATTRIBUTE_NOT_SET, + "Padding mask requires seq_len_q and seq_len_kv to be set."); + RETURN_CUDNN_FRONTEND_ERROR_IF((!attributes.padding_mask && !attributes.attention_score_modifier) && (has_seq_len_q || has_seq_len_kv), + error_code_t::ATTRIBUTE_NOT_SET, + "seq_len_q and seq_len_kv needs to be set only if padding mask is enabled."); + + // validate options for max_total_seq_len + RETURN_CUDNN_FRONTEND_ERROR_IF((attributes.max_total_seq_len_q.has_value() || attributes.max_total_seq_len_kv.has_value()) && !is_ragged, + error_code_t::GRAPH_NOT_SUPPORTED, + "max_total_seq_len_q is only supported with packed layout"); + + // validate options for bottom right causal mask + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.has_causal_mask_bottom_right() && (!attributes.padding_mask) && s_q > s_kv, + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask does not support max_s_q > max_s_kv. Please virtually slice the Q tensor and pass it as max_s_q == max_s_kv"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.has_causal_mask_bottom_right() && (is_bias || attributes.alibi_mask || (is_ragged && !attributes.padding_mask) || is_dropout), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask is only supported with is_bias=False, is_alibi=False, is_dropout=False. 
Further is_ragged==True is only allowed when padding_mask=True."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.has_causal_mask_bottom_right() && (detail::get_backend_version() < 90600) && ((s_q % 64 != 0) || (s_kv % 64 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask is only supported with s_q multiple of 64, and s_kv multiple of 64, for cudnn version below 9.6.0"); + + // validate options for sliding window length + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.left_bound.has_value() && attributes.left_bound.value() <= 0, + error_code_t::INVALID_VALUE, + "Left bound (Sliding window length) should be greater than or equals to zero when set."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.left_bound.has_value() && (s_q * attributes.left_bound.value() == s_kv * attributes.left_bound.value()) && (detail::get_backend_version() <= 90900) && (prop.major == 9) && attributes.has_causal_mask_bottom_right(), + error_code_t::GRAPH_NOT_SUPPORTED, + "On Hopper architecture, this specific combination of s_q, s_kv, and left_bound + right_bound + bottom right diagonal alignment is not supported for backend version 9.9 or below"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.left_bound.has_value() && (!attributes.padding_mask) && s_q > s_kv, + error_code_t::GRAPH_NOT_SUPPORTED, + "Sliding window attention is only supported with max_s_q <= max_s_kv."); + + if ((detail::get_backend_version() >= 91002)) { + RETURN_CUDNN_FRONTEND_ERROR_IF((attributes.left_bound.has_value() || attributes.right_bound.has_value()) && ((is_ragged && !attributes.padding_mask)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Left and right bounds with is_ragged==True is only allowed when padding_mask=True. And the diagonal alignment must be set."); + } else { + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.left_bound.has_value() && (! 
attributes.has_causal_like_masking() || is_dropout || is_bias || (is_ragged && !attributes.padding_mask)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Left and right bounds are only supported with is_dropout=False, is_bias=False. Further is_ragged==True is only allowed when padding_mask=True. Lastly the diagonal alignment must be set."); + } + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.right_bound.has_value() && attributes.right_bound.value() < 0, + error_code_t::INVALID_VALUE, + "Right bound needs to be larger than or equal to zero"); + + // validate options for dropout mask + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.dropout_probability.has_value() && is_dropout_custom, + error_code_t::ATTRIBUTE_NOT_SET, + "Using both, custom dropout mask and internal-mask generation using dropout probability, is ill-formed."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.dropout_probability.has_value() && attributes.dropout_probability.value() == 1.0, + error_code_t::ATTRIBUTE_NOT_SET, + "Dropout probability cannot be 1 as corresponding scale wont be well formed."); + + // validate options for deterministic algorithm + if(attributes.is_deterministic_algorithm && (prop.major == 10)) { + RETURN_CUDNN_FRONTEND_ERROR_IF( (detail::get_backend_version() < 91800), + error_code_t::GRAPH_NOT_SUPPORTED, + "Deterministic algorithm is not supported on blackwell architecture with cudnn version below 9.18.0"); + + // dbias bias rng/dropout alibi + RETURN_CUDNN_FRONTEND_ERROR_IF(is_dbias || is_rng || is_dropout || attributes.alibi_mask, + error_code_t::GRAPH_NOT_SUPPORTED, + "Deterministic algorithm is not supported on blackwell architecture when dbias, rng/dropout, alibi is enabled"); + + is_deterministic_algorithm_supported_on_blackwell = true; + } + + if(detail::get_backend_version() >= 91801) { + RETURN_CUDNN_FRONTEND_ERROR_IF(is_ragged && (8 == prop.major || 12 == prop.major) && attributes.is_deterministic_algorithm, + error_code_t::GRAPH_NOT_SUPPORTED, + "Deterministic algorithm is not 
supported for bprop thd on SM8X and SM12X GPUs"); + } + + // version specific validation + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 8906 && ((s_kv % 64 != 0) || (d_qk % 64 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 8.9.6, s_kv not a multiple of 64 or d not a multiple of 64 is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 8907 && (s_kv % 64 != 0) && (!(attributes.padding_mask)), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 8.9.7, s_kv not a multiple of 64 is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90000 && ((s_q % 64 != 0) || (s_kv % 64 != 0)) && (attributes.padding_mask || is_dropout), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.0.0, s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90000 && (s_q < 64), + error_code_t::GRAPH_NOT_SUPPORTED, + " Sequence length must be greater than or equal to 64 for cudnn version prior to v9.0.0"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90200 && attributes.left_bound.has_value(), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.2.0, sliding window attention is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90500 && is_dbias && attributes.padding_mask, + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.5.0, dBias with variable sequence lengths is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90500 && is_dbias && ((s_q % 64 != 0) || (s_kv % 64 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.5.0, dBias not support s_q/s_kv which aren't multiple of 64"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90600 && is_ragged && ((h_q != h_k) || (h_q != h_v)), + error_code_t::GRAPH_NOT_SUPPORTED, + "For 
cuDNN version below 9.6.0, group-query attention with raggged offset is not supported"); + + // TODO add version check once fixed + RETURN_CUDNN_FRONTEND_ERROR_IF(prop.major == 10 && is_rng, + error_code_t::GRAPH_NOT_SUPPORTED, + "Dropout RNG dump is not supported for SM Major version 10"); + + // TODO add version check once fixed + RETURN_CUDNN_FRONTEND_ERROR_IF(prop.major == 10 && is_ragged && is_dbias, + error_code_t::GRAPH_NOT_SUPPORTED, + "dbias with ragged is not supported for SM Major version 10"); + + // validate that datatype is set for the graph + RETURN_CUDNN_FRONTEND_ERROR_IF(this->context.get_intermediate_data_type() == DataType_t::NOT_SET, + error_code_t::ATTRIBUTE_NOT_SET, + "Intermediate tensor data type needs to be set as internal tensors require it."); + // If dsink is set, sink also needs to be set + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.outputs.find(output_names::DSINK_TOKEN) != attributes.outputs.end() && attributes.inputs.find(input_names::SINK_TOKEN) == attributes.inputs.end(), + error_code_t::ATTRIBUTE_NOT_SET, + "If dsink is set, sink also needs to be set."); + // clang-format on + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + // clang-format off + if (detail::get_backend_version() < 90600 && (attributes.max_total_seq_len_q.has_value() || attributes.max_total_seq_len_kv.has_value())) { + CUDNN_FE_LOG_LABEL_ENDL("WARNING: sdpa_backward.attributes.max_total_seq_len has been set, but cuDNN version is below 9.6.0 does not support max_total_seq_len_q. 
The workspace memory size required to execute this graph may be unexpectedly large"); + attributes.max_total_seq_len_q.reset(); + attributes.max_total_seq_len_kv.reset(); + } + + // TODO add version check once fixed + int64_t d_qk = attributes.inputs.at(input_names::Q)->get_dim()[3]; + int64_t d_v = attributes.inputs.at(input_names::V)->get_dim()[3]; + if ((attributes.max_total_seq_len_q.has_value() || attributes.max_total_seq_len_kv.has_value()) && (d_qk % 16 != 0 || d_v % 16 != 0)) { + CUDNN_FE_LOG_LABEL_ENDL("WARNING: sdpa_backward.attributes.max_total_seq_len has been set, but d is not a multiple of 16 has a known functional issue. The workspace memory size required to execute this graph may be unexpectedly large"); + attributes.max_total_seq_len_q.reset(); + attributes.max_total_seq_len_kv.reset(); + } + + + if(detail::get_backend_version() >= 91801) { + cudaDeviceProp prop; + int device; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device(&device)); + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device_properties(&prop, device)); + if((8 == prop.major || 12 == prop.major) && (attributes.max_total_seq_len_q.has_value() || attributes.max_total_seq_len_kv.has_value())) { + attributes.max_total_seq_len_q.reset(); + attributes.max_total_seq_len_kv.reset(); + } + } + // clang-format on + + return {error_code_t::OK, ""}; + } + + error_t + expand_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for CompositeSDPABackwardNode " << attributes.name); + + attributes.fill_from_context(context); + + // Gather dim to fill properties of virtual tensors + auto const& q_dim = attributes.inputs[input_names::Q]->get_dim(); + auto b = q_dim[0]; + auto h_q = q_dim[1]; + auto s_q = q_dim[2]; + auto d_qk = q_dim[3]; + auto const& k_dim = attributes.inputs[input_names::K]->get_dim(); + auto h_k = k_dim[1]; + auto s_kv = k_dim[2]; + auto const& v_dim = attributes.inputs[input_names::V]->get_dim(); + auto h_v = v_dim[1]; + auto d_v = v_dim[3]; + + // cuDNN 
frontend API attention requires Q, K, V where + // Q = {b, h_q, s_q, d_qk} + // K = {b, h_k, s_kv, d_qk} + // V = {b, h_v, s_kv, d_v} + // but cuDNN backend API attention requires Q, KT, VT + // Q = {b, h_q, s_q, d_qk} + // KT = {b, h_k, d_qk, s_kv} + // VT = {b, h_v, d_v, s_kv} + // So the code below maps the K->KT and V->VT + std::vector temp_vec; + + temp_vec = attributes.inputs[input_names::K]->get_dim(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::K]->set_dim(temp_vec); + + temp_vec = attributes.inputs[input_names::K]->get_stride(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::K]->set_stride(temp_vec); + + temp_vec = attributes.inputs[input_names::V]->get_dim(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::V]->set_dim(temp_vec); + + temp_vec = attributes.inputs[input_names::V]->get_stride(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::V]->set_stride(temp_vec); + + std::shared_ptr last_output, exp_s_output, dS_output, rng_output; + + // --------------Initialize and create tensors before creating nodes-------------------- + // one_tensor is needed for non-dropout graphs + // one_tensor is passed by the node + auto one_tensor = std::make_shared(1.0f); + + if (attributes.attn_scale_value.has_value()) { + attributes.inputs[input_names::Attn_scale] = + std::make_shared(attributes.attn_scale_value.value()); + } + + // if dropout_mask is used, then the user passes scale and scale_inverse + bool is_dropout_prob = (attributes.dropout_probability.has_value()); + bool is_dropout_mask = (attributes.inputs[input_names::Dropout_mask] != nullptr); + if (is_dropout_prob) { + float dropout_scale_value = 1.0f / (1.0f - attributes.dropout_probability.value()); + float dropout_scale_inv_value = (1.0f - attributes.dropout_probability.value()); + + attributes.inputs[input_names::Dropout_scale] = std::make_shared(dropout_scale_value); + 
attributes.inputs[input_names::Dropout_scale_inv] = + std::make_shared(dropout_scale_inv_value); + } + + // ---------------------input tensor workarounds--------------------------- + + bool use_dp_workspace = false; + + cudaDeviceProp prop; + if (context.get_sm_version() > 0) { + prop.major = context.get_sm_version() / 10; + } else { + int device; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device(&device)); + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device_properties(&prop, device)); + } + + if (detail::get_backend_version() >= 8905 && detail::get_backend_version() < 90000) { + // workspace optimization is enabled by default when: + // 8.9.5 <= cudnn version < 9.0.0 + // device >= hopper + // batch * num_heads * seq_len_q * seq_len_kv * 2 <= dP workspace limit + // + // This following environment variable allows you to control the dP workspace limit. + // From cuDNN version 9.0.0, this option is obsolete will be ignored. + // CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT=unset - enable workspace opt. until the default 256MB limit. + // CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT=-1 - always enable workspace opt. + // CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT=0 - always disable workspace opt. + // CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT=n - enable workspace opt. 
until the n byte limit + + // hopper or above + if (prop.major >= 9) { + // default upper limit for workspace 256MB + int64_t max_dp_workspace_bytes = 256 * 1024 * 1024; + + // allow setting the upper limit with envvars + char* env_dp_workspace_limit_char = std::getenv("CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT"); + if (env_dp_workspace_limit_char) { + char* end_ptr = nullptr; + max_dp_workspace_bytes = std::strtoll(env_dp_workspace_limit_char, &end_ptr, 10); + + if (*end_ptr != '\0') { + RETURN_CUDNN_FRONTEND_ERROR_IF(true, + error_code_t::ATTRIBUTE_NOT_SET, + "Invalid argument for CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT " + "(int64_t; in bytes)"); + } + } + + int64_t workspace_s_q = ((s_q + 64 - 1) / 64) * 64; + int64_t workspace_s_kv = ((s_kv + 64 - 1) / 64) * 64; + int64_t required_dp_workspace_bytes = b * h_q * workspace_s_q * workspace_s_kv * 2; + + if (max_dp_workspace_bytes == -1) { + use_dp_workspace = true; + } else if (max_dp_workspace_bytes == 0) { + use_dp_workspace = false; + } else { + use_dp_workspace = (required_dp_workspace_bytes <= max_dp_workspace_bytes); + } + } + } + + // Force dP workspace implementation if: + // - dBias is enabled (dBias is only supported on workspace implementation) + // - the user force requests deterministic algorithm on hopper + if (attributes.outputs[output_names::dBias] || attributes.is_deterministic_algorithm) { + use_dp_workspace = true; + } + + // --------------RNG node-------------------- + + if (is_dropout_prob) { + if (attributes.outputs[output_names::RNG_DUMP] != nullptr) { + rng_output = attributes.outputs[output_names::RNG_DUMP]; + rng(attributes.inputs[input_names::Seed], + attributes.inputs[input_names::Offset], + Rng_attributes() + .set_name("rng") + .set_distribution(RngDistribution_t::BERNOULLI) + .set_bernoulli_probability(1.0f - attributes.dropout_probability.value()), + rng_output); + } else { + rng_output = rng(attributes.inputs[input_names::Seed], + attributes.inputs[input_names::Offset], + 
Rng_attributes() + .set_name("rng") + .set_distribution(RngDistribution_t::BERNOULLI) + .set_bernoulli_probability(1.0f - attributes.dropout_probability.value())); + rng_output->set_dim({b, h_q, s_q, s_kv}).set_stride({h_q * s_q * s_kv, s_q * s_kv, s_kv, 1}); + } + } else if (is_dropout_mask) { + rng_output = attributes.inputs[input_names::Dropout_mask]; + } + + // --------------"dO * o => softmax_sum" chain-------------------- + + // last_output = dO * O + last_output = pointwise(attributes.inputs[input_names::dO], + attributes.inputs[input_names::O], + Pointwise_attributes().set_name("mul_dO_O").set_mode(PointwiseMode_t::MUL)); + last_output->set_dim({b, h_q, s_q, d_v}).set_stride({h_q * s_q * d_v, s_q * d_v, h_q * d_v, 1}); + + // last_output = reduce(last_output, "b hq sq dv -> b hq sq 1") + last_output = + reduction(last_output, Reduction_attributes().set_name("reduce_dO_o").set_mode(ReductionMode_t::ADD)); + last_output->set_dim({b, h_q, s_q, 1}).set_stride({h_q * s_q, s_q, 1, 1}); + + if (attributes.outputs.find(output_names::DSINK_TOKEN) != attributes.outputs.end()) { + // sub_sink = sink - stats + auto sub_sink = pointwise(attributes.inputs[input_names::SINK_TOKEN], + attributes.inputs[input_names::Stats], + Pointwise_attributes().set_name("sub_sink").set_mode(PointwiseMode_t::SUB)); + + // exp_sink = exp(sub_sink) + auto exp_sink = + pointwise(sub_sink, Pointwise_attributes().set_name("exp_sink").set_mode(PointwiseMode_t::EXP)); + + // per_token_grad = exp_sink * last_output + auto per_token_grad = + pointwise(exp_sink, + last_output, + Pointwise_attributes().set_name("mul_exp_sink_last_output").set_mode(PointwiseMode_t::MUL)); + + // dSink = redduce(per_token_grad) + reduction(per_token_grad, + Reduction_attributes().set_name("reduce_per_token_grad").set_mode(ReductionMode_t::ADD), + attributes.outputs[output_names::DSINK_TOKEN]); + } + + // softmax_sum = last_output * dropout_scale + last_output = pointwise(last_output, + 
attributes.inputs[input_names::Dropout_scale_inv] + ? attributes.inputs[input_names::Dropout_scale_inv] + : one_tensor, + Pointwise_attributes().set_name("scale_dropout_inv").set_mode(PointwiseMode_t::MUL)); + last_output->set_dim({b, h_q, s_q, 1}).set_stride({h_q * s_q, s_q, 1, 1}); + + softmax_sum = last_output; + softmax_sum->set_is_virtual(false); + softmax_sum->set_dim({b, h_q, s_q, 1}); + softmax_sum->set_data_type(DataType_t::FLOAT); + + if (attributes.inputs[input_names::Stats]->get_ragged_offset() && attributes.max_total_seq_len_q.has_value()) { + // sized TH1 softmax_sum + softmax_sum->set_stride(attributes.inputs[input_names::Stats]->get_stride()); + softmax_sum->set_ragged_offset(attributes.inputs[input_names::Stats]->get_ragged_offset()); + softmax_sum_size = attributes.max_total_seq_len_q.value() * + (attributes.inputs[input_names::Stats]->get_stride())[2] * sizeof(float); + } else { + // sized BHS1 softmax_sum + softmax_sum->set_stride({h_q * s_q, s_q, 1, 1}); + softmax_sum_size = b * h_q * s_q * 1 * sizeof(float); + } + + // --------------"Q @ KT => exp_softmax => dV" chain-------------------- + + // s = einsum(q, k, "b hq sq dqk, b (hk g) skv dqk -> b hq sq skv", g=hq//hk) + last_output = matmul(attributes.inputs[input_names::Q], + attributes.inputs[input_names::K], + Matmul_attributes() + .set_name("matmul_Q_KT") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_n_override(attributes.inputs[input_names::SEQ_LEN_KV])); + last_output->set_dim({b, h_q, s_q, s_kv}).set_stride({h_q * s_q * s_kv, s_q * s_kv, s_kv, 1}); + + // last_output = last_output * attention_scale + if (attributes.inputs[input_names::Attn_scale]) { + last_output = pointwise(last_output, + attributes.inputs[input_names::Attn_scale], + Pointwise_attributes().set_name("mul_s_attn_scale").set_mode(PointwiseMode_t::MUL)); + } + + if (attributes.attention_score_modifier != nullptr) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = 
std::static_pointer_cast(graph_); + node_->context = context; + last_output = attributes.attention_score_modifier(graph_, last_output); + sub_nodes.emplace_back(node_); + } + + // (optional) last_output = last_output + bias + if (attributes.inputs.find(input_names::Bias) != attributes.inputs.end() && + attributes.inputs[input_names::Bias]) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = context; + last_output = attn::score_modifiers::bias(graph_, last_output, attributes.inputs[input_names::Bias]); + sub_nodes.emplace_back(node_); + } + + // (optional) last_output = last_output + alibi_mask + if (attributes.alibi_mask) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = context; + last_output = attn::score_modifiers::alibi_mask(graph_, last_output, alibi_slopes, h_q, alibi_slopes_size); + sub_nodes.emplace_back(node_); + } + + // (optional) Apply padding mask + if (attributes.padding_mask) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = context; + last_output = attn::score_modifiers::padding_mask(graph_, + last_output, + attributes.inputs[input_names::SEQ_LEN_KV], + attributes.inputs[input_names::SEQ_LEN_Q]); + sub_nodes.emplace_back(node_); + } + + // last_output = last_output - stats + last_output = pointwise(last_output, + attributes.inputs[input_names::Stats], + Pointwise_attributes().set_name("sub_s_m").set_mode(PointwiseMode_t::SUB)); + + // WAR for bug 4475073 by explicitly putting the padding value again after the stats have been loaded + if (attributes.padding_mask && detail::get_backend_version() >= 90000 && + detail::get_backend_version() < 91000) { + auto row_idx_output = pointwise(last_output, + Pointwise_attributes() + .set_name("gen_row_idx_2nd_padding") + .set_mode(PointwiseMode_t::GEN_INDEX) + .set_axis(2) + 
.set_compute_data_type(DataType_t::INT32)); + row_idx_output->set_data_type(DataType_t::INT32); + + auto col_idx_output = pointwise(last_output, + Pointwise_attributes() + .set_name("gen_col_idx_2nd_padding") + .set_mode(PointwiseMode_t::GEN_INDEX) + .set_axis(3) + .set_compute_data_type(DataType_t::INT32)); + col_idx_output->set_data_type(DataType_t::INT32); + + auto row_mask_output = pointwise(row_idx_output, + attributes.inputs[input_names::SEQ_LEN_Q], + Pointwise_attributes() + .set_name("lt_row_sq_2nd_padding") + .set_mode(PointwiseMode_t::CMP_LT) + .set_compute_data_type(DataType_t::BOOLEAN)); + row_mask_output->set_data_type(DataType_t::BOOLEAN); + + auto col_mask_output = pointwise(col_idx_output, + attributes.inputs[input_names::SEQ_LEN_KV], + Pointwise_attributes() + .set_name("lt_col_skv_2nd_padding") + .set_mode(PointwiseMode_t::CMP_LT) + .set_compute_data_type(DataType_t::BOOLEAN)); + col_mask_output->set_data_type(DataType_t::BOOLEAN); + + auto padding_mask_output = pointwise(row_mask_output, + col_mask_output, + Pointwise_attributes() + .set_name("and_row_col_2nd_padding") + .set_mode(PointwiseMode_t::LOGICAL_AND) + .set_compute_data_type(DataType_t::BOOLEAN)); + padding_mask_output->set_data_type(DataType_t::BOOLEAN); + auto negative_inf_padding = + std::make_shared(attn::score_modifiers::get_negative_inf_value()); + + last_output = pointwise( + last_output, + negative_inf_padding, + padding_mask_output, + Pointwise_attributes().set_name("select_2nd_padding").set_mode(PointwiseMode_t::BINARY_SELECT)); + } + + // Apply (bottom-right) causal masking (with right bound) and/or set the left bound + if (attributes.left_bound.has_value() || attributes.right_bound.has_value()) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = context; + + auto s_kv_ptr = attributes.inputs.find(input_names::SEQ_LEN_KV) != attributes.inputs.end() + ? 
attributes.inputs[input_names::SEQ_LEN_KV] + : nullptr; + auto s_q_ptr = attributes.inputs.find(input_names::SEQ_LEN_Q) != attributes.inputs.end() + ? attributes.inputs[input_names::SEQ_LEN_Q] + : nullptr; + + last_output = attn::score_modifiers::sliding_window_mask(graph_, + last_output, + attributes.diagonal_alignment, + attributes.left_bound, + attributes.right_bound, + s_q, + s_kv, + s_q_ptr, + s_kv_ptr); + sub_nodes.emplace_back(std::move(node_)); + } + + // last_output = exp(last_output) + last_output = pointwise(last_output, Pointwise_attributes().set_name("exp_s").set_mode(PointwiseMode_t::EXP)); + + exp_s_output = last_output; + + // (optional) last_output = last_output * dropout rng_output + if (is_dropout_prob || is_dropout_mask) { + last_output = + pointwise(last_output, + rng_output, + Pointwise_attributes().set_name("mul_p_dropout_mask").set_mode(PointwiseMode_t::MUL)); + } + + // (optional) last_output = last_output * dropout_scale + if (attributes.inputs[input_names::Dropout_scale]) { + last_output = + pointwise(last_output, + attributes.inputs[input_names::Dropout_scale], + Pointwise_attributes().set_name("mul_p_dropout_scale").set_mode(PointwiseMode_t::MUL)); + } + + // dV = einsum(p, dO, "b hq sq skv", "b hq sq dv -> b hq skv dv") + // if GQA, then dV = reduce(dV, "b (hv g) skv dv -> b hv skv dv", g=hq//hv) + // as reshape + matmul + last_output = reshape(last_output, Reshape_attributes().set_name("reshape_p")); + last_output->set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); + + if (h_q == h_v) { + // for MHA + matmul(last_output, + attributes.inputs[input_names::dO], + Matmul_attributes() + .set_name("matmul_pT_dO") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q]), + attributes.outputs[output_names::dV]); + } else { + // for GQA and MQA + dV_fullhead = 
matmul(last_output, + attributes.inputs[input_names::dO], + Matmul_attributes() + .set_name("matmul_pT_dO") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q])); + + dV_fullhead->set_dim({b, h_q, s_kv, d_v}); + dV_fullhead->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); + + if (attributes.outputs[output_names::dV]->get_ragged_offset() && + attributes.max_total_seq_len_kv.has_value()) { + // hack 1 - map dV strides to dV_fullhead strides + std::vector dV_fullhead_stride = attributes.outputs[output_names::dV]->get_stride(); + dV_fullhead_stride[2] = dV_fullhead_stride[2] * (h_q / h_v); // sequence stride + dV_fullhead_stride[0] = dV_fullhead_stride[0] * (h_q / h_v); // batch stride + dV_fullhead->set_stride(dV_fullhead_stride); + // hack 2 - map dV ragged offset to dV_fullhead ragged offset with implicit multiplier + // implicit multiplier = h_q / h_v + dV_fullhead->set_ragged_offset(attributes.outputs[output_names::dV]->get_ragged_offset()); + // hack 3 - non virtual dV full head + dV_fullhead->set_is_virtual(false); + dV_fullhead_size = attributes.max_total_seq_len_kv.value() * dV_fullhead_stride[2] * sizeof(float); + } else { + // sized BHSD dQ_accum + dV_fullhead->set_stride({h_q * s_kv * d_v, s_kv * d_v, d_v, 1}); + } + + reduction(dV_fullhead, + Reduction_attributes().set_name("red_dV_head").set_mode(ReductionMode_t::ADD), + attributes.outputs[output_names::dV]); + } + + // --------------"dO @ VT => dS_output => dK" chain-------------------- + + // dP = einsum(dO, v, "b hq sq dv, b (hv g) skv dv -> b hq sq skv", g=hq//hv) + last_output = matmul(attributes.inputs[input_names::dO], + attributes.inputs[input_names::V], + Matmul_attributes() + .set_name("matmul_dO_VT") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_n_override(attributes.inputs[input_names::SEQ_LEN_KV])); + last_output->set_dim({b, h_q, s_q, s_kv}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 
s_kv, 1}); + + // last_output = last_output(dP) * mask + if (is_dropout_prob || is_dropout_mask) { + last_output = pointwise(last_output, + rng_output, + Pointwise_attributes().set_name("dP_dropout_mask").set_mode(PointwiseMode_t::MUL)); + } + + // last_output = last_output - softmax_sum + last_output = pointwise(last_output, + softmax_sum, + Pointwise_attributes().set_name("sub_dP_softmax_sum").set_mode(PointwiseMode_t::SUB)); + + // last_output = last_output * exp_s_output + last_output = pointwise( + last_output, exp_s_output, Pointwise_attributes().set_name("mul_dP_exp_s").set_mode(PointwiseMode_t::MUL)); + + // (optional) last_output = last_output * dropout_scale + if (attributes.inputs[input_names::Dropout_scale]) { + last_output = + pointwise(last_output, + attributes.inputs[input_names::Dropout_scale], + Pointwise_attributes().set_name("mul_dS_dropout_scale").set_mode(PointwiseMode_t::MUL)); + } + + if (attributes.outputs[output_names::dBias]) { + reduction(last_output, + Reduction_attributes().set_name("red_dP_dBias").set_mode(ReductionMode_t::ADD), + attributes.outputs[output_names::dBias]); + } + + // apply the bprop of attention score modifier + if (attributes.attention_score_modifier_bprop != nullptr) { + auto graph_ = std::make_shared(); + std::shared_ptr node_ = std::static_pointer_cast(graph_); + node_->context = context; + last_output = attributes.attention_score_modifier_bprop(graph_, last_output); + sub_nodes.emplace_back(node_); + } + + // (optional) last_output = last_output * bmm_scale + if (attributes.inputs[input_names::Attn_scale]) { + last_output = + pointwise(last_output, + attributes.inputs[input_names::Attn_scale], + Pointwise_attributes().set_name("mul_dS_attn_scale").set_mode(PointwiseMode_t::MUL)); + } + + dS_output = last_output; + + // dK = einsum(dS, Q, "b hq sq skv", "b hq sq dqk -> b hq skv dqk") + // if GQA, then dK = reduce(dK, "b (hk g) skv dqk -> b hk skv dqk", hq//hk) + // as reshape + matmul + last_output = 
reshape(last_output, Reshape_attributes().set_name("reshape_dS")); + last_output->set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); + last_output->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); + + if (h_q == h_k) { + // for MHA + matmul(last_output, + attributes.inputs[input_names::Q], + Matmul_attributes() + .set_name("matmul_dST_Q") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q]), + attributes.outputs[output_names::dK]); + } else { + // for GQA and MQA + dK_fullhead = matmul(last_output, + attributes.inputs[input_names::Q], + Matmul_attributes() + .set_name("matmul_dST_Q") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q])); + + dK_fullhead->set_dim({b, h_q, s_kv, d_qk}); + dK_fullhead->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); + + if (attributes.outputs[output_names::dK]->get_ragged_offset() && + attributes.max_total_seq_len_kv.has_value()) { + // sized THD dK_full_heads + // hack 1 - map dK strides to dK_fullhead strides + std::vector dK_fullhead_stride = attributes.outputs[output_names::dK]->get_stride(); + dK_fullhead_stride[0] = dK_fullhead_stride[0] * (h_q / h_k); // batch stride + dK_fullhead_stride[2] = dK_fullhead_stride[2] * (h_q / h_k); // sequence stride + dK_fullhead->set_stride(dK_fullhead_stride); + // hack 2 - map dK ragged offset to dK_fullhead ragged offset with implicit multiplier + // implicit multiplier = h_q / h_k + dK_fullhead->set_ragged_offset(attributes.outputs[output_names::dK]->get_ragged_offset()); + // hack 3 - non virtual dK full head + dK_fullhead->set_is_virtual(false); + dK_fullhead_size = attributes.max_total_seq_len_kv.value() * dK_fullhead_stride[2] * sizeof(float); + } else { + // sized BHSD dQ_accum + dK_fullhead->set_stride({h_q * s_kv * d_qk, s_kv * d_qk, d_qk, 1}); + } + + reduction(dK_fullhead, + 
Reduction_attributes().set_name("red_dK_head").set_mode(ReductionMode_t::ADD), + attributes.outputs[output_names::dK]); + } + + // --------------"dp_scaled @ K => dQ" chain-------------------- + + auto const& kt_dim = attributes.inputs[input_names::K]->get_dim(); + auto const& kt_stride = attributes.inputs[input_names::K]->get_stride(); + + // dQ = einsum(dS, K, "b hq sq skv, b (hk g) skv dqk -> b hq sq dqk", g=hq//hk) + // as reshape + matmul + last_output = reshape(attributes.inputs[input_names::K], Reshape_attributes().set_name("reshape_k")); + last_output->set_dim({kt_dim[0], kt_dim[1], kt_dim[3], kt_dim[2]}) + .set_stride({kt_stride[0], kt_stride[1], kt_stride[3], kt_stride[2]}); + + if (attributes.inputs[input_names::K]->get_ragged_offset() != nullptr) { + last_output->set_ragged_offset(attributes.inputs[input_names::K]->get_ragged_offset()); + } + + if (!use_dp_workspace) { + dQ_accum = std::make_shared(); + dQ_accum->set_is_virtual(false); + dQ_accum->set_dim({b, h_q, s_q, d_qk}); + dQ_accum->set_data_type(DataType_t::FLOAT); + + if (attributes.outputs[output_names::dQ]->get_ragged_offset() && + attributes.max_total_seq_len_q.has_value()) { + // sized THD dQ_accum + dQ_accum->set_stride(attributes.outputs[output_names::dQ]->get_stride()); + dQ_accum->set_ragged_offset(attributes.outputs[output_names::dQ]->get_ragged_offset()); + dQ_accum_size = attributes.max_total_seq_len_q.value() * + (attributes.outputs[output_names::dQ]->get_stride())[2] * sizeof(float); + } else { + // sized BHSD dQ_accum + dQ_accum->set_stride({h_q * s_q * d_qk, s_q * d_qk, d_qk, 1}); + dQ_accum_size = b * h_q * s_q * d_qk * sizeof(float); + } + + matmul(dS_output, + last_output, + Matmul_attributes() + .set_name("matmul_dS_K") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_KV]), + dQ_accum); + + pointwise(dQ_accum, + Pointwise_attributes().set_name("identity_dQ").set_mode(PointwiseMode_t::IDENTITY), + 
attributes.outputs[output_names::dQ]); + } else { + matmul(dS_output, + last_output, + Matmul_attributes() + .set_name("matmul_dS_K") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_KV]), + attributes.outputs[output_names::dQ]); + } + + return {error_code_t::OK, ""}; + } + + std::pair> + override_heuristics_query() const { + if (is_deterministic_algorithm_supported_on_blackwell) { + return {5, {{KnobType_t::KERNEL_CFG, 31}, {KnobType_t::STAGES, 2}}}; + } else { + return {-1, {}}; + } + } + + virtual int64_t + get_fe_workspace_size_node() const override final { + int64_t size = 0; + + size += ((alibi_slopes_size + 15) / 16 * 16); // align alibi slopes memory to 16 bytes + size += dQ_accum_size; + size += dK_fullhead_size; + size += dV_fullhead_size; + size += softmax_sum_size; + + if (has_workaround_padding_mask) { + size += batch_size_for_workaround_padding_mask * sizeof(int32_t) * 2; + } + + return size; + } + + virtual error_t + collect_tensors_in_workspace_node( + std::unordered_map>>& + workspace_modifications, + int64_t& offset) const override final { + if (attributes.alibi_mask) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Q, input_names::Q); + int64_t const h_q = Q->second->get_dim()[1]; + auto alibi_slopes_vec = detail::get_alibi_slope(h_q); + workspace_modifications.emplace(alibi_slopes->get_uid(), std::make_tuple(0, offset, alibi_slopes_vec)); + int64_t alibi_slopes_size_padded = ((alibi_slopes_size + 15) / 16 * 16); + offset = offset + alibi_slopes_size_padded; + } + + if (dQ_accum && !dQ_accum->get_is_virtual()) { + if (detail::get_backend_version() < 90600) { + // prior to cuDNN 9.6.0, dQ_accum needed to be memset by frontend + workspace_modifications.emplace(dQ_accum->get_uid(), + std::make_tuple(1, offset, std::vector{(float)dQ_accum_size})); + } else { + workspace_modifications.emplace(dQ_accum->get_uid(), std::make_tuple(2, offset, std::vector())); + } + offset = offset + 
dQ_accum_size; + } + + if (dK_fullhead && !dK_fullhead->get_is_virtual()) { + workspace_modifications.emplace(dK_fullhead->get_uid(), std::make_tuple(2, offset, std::vector())); + offset = offset + dK_fullhead_size; + } + + if (dV_fullhead && !dV_fullhead->get_is_virtual()) { + workspace_modifications.emplace(dV_fullhead->get_uid(), std::make_tuple(2, offset, std::vector())); + offset = offset + dV_fullhead_size; + } + + if (softmax_sum && !softmax_sum->get_is_virtual()) { + workspace_modifications.emplace(softmax_sum->get_uid(), std::make_tuple(2, offset, std::vector())); + offset = offset + softmax_sum_size; + } + + if (has_workaround_padding_mask) { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Collecting workaround padding mask tensors with batch size " + << batch_size_for_workaround_padding_mask << " with UIDs " + << workaround_padding_mask_seq_len_q->get_uid() << " and " + << workaround_padding_mask_seq_len_kv->get_uid()); + std::vector workaround_padding_mask_seq_len_q_vec(batch_size_for_workaround_padding_mask, + s_q_for_workaround_padding_mask); + std::vector workaround_padding_mask_seq_len_kv_vec(batch_size_for_workaround_padding_mask, + s_kv_for_workaround_padding_mask); + + // reinterpret_cast the int32_t vector data to float vector for workspace_modifications + std::vector workaround_padding_mask_seq_len_q_vec_float( + reinterpret_cast(workaround_padding_mask_seq_len_q_vec.data()), + reinterpret_cast(workaround_padding_mask_seq_len_q_vec.data()) + + batch_size_for_workaround_padding_mask); + std::vector workaround_padding_mask_seq_len_kv_vec_float( + reinterpret_cast(workaround_padding_mask_seq_len_kv_vec.data()), + reinterpret_cast(workaround_padding_mask_seq_len_kv_vec.data()) + + batch_size_for_workaround_padding_mask); + + workspace_modifications.emplace(workaround_padding_mask_seq_len_q->get_uid(), + std::make_tuple(0, offset, workaround_padding_mask_seq_len_q_vec_float)); + offset = offset + batch_size_for_workaround_padding_mask * sizeof(float); + 
workspace_modifications.emplace(workaround_padding_mask_seq_len_kv->get_uid(), + std::make_tuple(0, offset, workaround_padding_mask_seq_len_kv_vec_float)); + offset = offset + batch_size_for_workaround_padding_mask * sizeof(float); + } + + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "SDPA_BWD"})"_json); + } +#endif +}; + +class UnifiedSDPANode : public SDPANodeBase { + public: + UnifiedSDPANode(SDPA_attributes&& attributes_, detail::Context const& context) + : SDPANodeBase(std::move(attributes_), context) {} + + Type + getType() override final { + return Type::UNIFIED_SDPA; + } + + error_t + expand_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for UnifiedSDPANode node " << attributes.name); + + // DO NOT REMOVE + // input data type is needed for: + // - aType of bmm2 + // - dropout scale in pre 8.9.3 + attributes.fill_from_context(this->context); + + //// Optional Attn scale + // In case user provided a scalar value, do a fused scalar. 
+ if (attributes.attn_scale_value.has_value()) { + attributes.inputs[input_names::Attn_scale] = + std::make_shared(attributes.attn_scale_value.value()); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>& operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& tensors) const override final { + CUDNN_FRONTEND_UNUSED(operations); + CUDNN_FE_LOG_LABEL("INFO: " << "Building UnifiedSDPANode operations " << attributes.name << " "); + auto cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Unified SDPA node requires cuDNN 9.13.1"}; + +#if (CUDNN_VERSION >= 91301) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(91301, cudnn_ver_error); + auto unified_sdpa_operation = + make_shared_backend_pointer((cudnnBackendDescriptorType_t)CUDNN_BACKEND_OPERATION_SDPA_FWD_DESCRIPTOR); + + auto Q = attributes.inputs.find(SDPA_attributes::input_names::Q)->second; + auto backend_q = tensors[Q->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_QDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_q)); + + auto K = attributes.inputs.find(SDPA_attributes::input_names::K)->second; + auto backend_k = tensors[K->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_KDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_k)); + + auto V = attributes.inputs.find(SDPA_attributes::input_names::V)->second; + auto backend_v = tensors[V->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_VDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_v)); + + auto O = 
attributes.outputs.find(SDPA_attributes::output_names::O)->second; + auto backend_o = tensors[O->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_ODESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_o)); + + auto stats_it = attributes.outputs.find(SDPA_attributes::output_names::Stats); + if (stats_it != attributes.outputs.end()) { + auto backend_stats = tensors[stats_it->second->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_STATSDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_stats)); + } + + auto attn_scale_it = attributes.inputs.find(SDPA_attributes::input_names::Attn_scale); + if (attn_scale_it != attributes.inputs.end()) { + auto backend_scale = tensors[attn_scale_it->second->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_SCALEDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_scale)); + } + + auto block_mask_it = attributes.inputs.find(SDPA_attributes::input_names::Block_mask); + if (block_mask_it != attributes.inputs.end() && block_mask_it->second != nullptr) { + auto block_mask_cudnn_ver_error = + error_t{error_code_t::GRAPH_NOT_SUPPORTED, "Block mask in unified SDPA node requires cuDNN 9.14.0"}; +#if CUDNN_VERSION >= 91400 + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(91400, block_mask_cudnn_ver_error); + auto backend_block_mask = tensors[block_mask_it->second->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_BLOCK_MASK_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_block_mask)); +#else + return 
block_mask_cudnn_ver_error; +#endif + } + + // Paged attention attributes + if (is_paged_k() || is_paged_v() || has_seq_len_q() || has_seq_len_kv()) { + auto paged_cudnn_ver_error = error_t{error_code_t::GRAPH_NOT_SUPPORTED, + "Paged attention in unified SDPA node requires cuDNN 9.15.0"}; +#if (CUDNN_VERSION >= 91500) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(91500, paged_cudnn_ver_error); + + if (is_paged_k()) { + auto page_table_K = attributes.inputs.find(SDPA_attributes::input_names::Page_table_K)->second; + auto backend_page_table_K = tensors[page_table_K->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_KDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_page_table_K)); + } + + if (is_paged_v()) { + auto page_table_V = attributes.inputs.find(SDPA_attributes::input_names::Page_table_V)->second; + auto backend_page_table_V = tensors[page_table_V->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_VDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_page_table_V)); + } + + if (has_seq_len_q()) { + auto seq_len_Q = attributes.inputs.find(SDPA_attributes::input_names::SEQ_LEN_Q)->second; + auto backend_seq_len_Q = tensors[seq_len_Q->get_uid()]->get_desc()->get_backend_descriptor(); + _CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_QDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_seq_len_Q)); + } + + if (has_seq_len_kv()) { + auto seq_len_KV = attributes.inputs.find(SDPA_attributes::input_names::SEQ_LEN_KV)->second; + auto backend_seq_len_KV = tensors[seq_len_KV->get_uid()]->get_desc()->get_backend_descriptor(); + 
_CUDNN_CHECK_CUDNN_ERROR(detail::set_attribute(unified_sdpa_operation->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_KVDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &backend_seq_len_KV)); + } + + // Ignore attributes.max_seq_len_kv, because unified engine doesn't need it (it's harmless if set). + + // Ignore attributes.padding_mask, because unified engine already applies an implicit padding mask + // if seq_len_Q and seq_len_KV are both provided. We already checked in + // `SDPA_attributes::validate_sdpa_support_surface()` that padding_mask must be true if and + // only if seq_len_Q and seq_len_KV are both set, so we don't need to check it here. +#else + return paged_cudnn_ver_error; +#endif + } + + _CUDNN_CHECK_CUDNN_ERROR(detail::finalize(unified_sdpa_operation->get_backend_descriptor())); + + raw_operations.push_back(unified_sdpa_operation); + + auto const& non_virtual_uids = attributes.get_non_virtual_uids(); + uids_involved_in_operations.insert(non_virtual_uids.begin(), non_virtual_uids.end()); + return {error_code_t::OK, ""}; +#else + CUDNN_FRONTEND_UNUSED(uids_involved_in_operations); + CUDNN_FRONTEND_UNUSED(raw_operations); + CUDNN_FRONTEND_UNUSED(tensors); + return cudnn_ver_error; +#endif // CUDNN_VERSION >= 91301 + } +}; + +} // namespace cudnn_frontend::graph diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/sdpa_fp8_bwd.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/sdpa_fp8_bwd.h new file mode 100644 index 00000000..225ecf1f --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/sdpa_fp8_bwd.h @@ -0,0 +1,649 @@ +#pragma once + +#include "../../cudnn_frontend_Heuristics.h" +#include "../../cudnn_frontend_Logging.h" + +#include "../graph_helpers.h" +#include "../node_interface.h" + +#include "matmul_fp8.h" +#include "pointwise.h" +#include "reduction.h" +#include "softmax.h" + +namespace cudnn_frontend::graph { + +class SDPAFP8BackwardNode : public NodeCRTP { + using input_names = 
SDPA_fp8_backward_attributes::input_names; + using output_names = SDPA_fp8_backward_attributes::output_names; + + private: + mutable bool is_deterministic_algorithm_supported_on_blackwell = false; // Will be edited in pre_validate_node() + + public: + SDPA_fp8_backward_attributes attributes; + + SDPAFP8BackwardNode(SDPA_fp8_backward_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::COMPOSITE; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating SDPAFP8BackwardNode " << attributes.name); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90100, + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 backward operation is only supported starting cudnn 9.1.0. Please " + "consider upgrading your current version."); + + cudaDeviceProp prop; + int device; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device(&device)); + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device_properties(&prop, device)); + RETURN_CUDNN_FRONTEND_ERROR_IF( + prop.major < 9, + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is only supported on Hopper architecture and newer. 
Please " + "consider using a newer architecture."); + + // check that Q, K, V, O, stats, dO, dQ, dK, dV tensors has been assigned + // check that dim and strides has been assigned and last stride is 1 +#define CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(port, port_map) \ + { \ + std::shared_ptr tensor_ptr = port_map.at(port); \ + RETURN_CUDNN_FRONTEND_ERROR_IF(tensor_ptr->get_dim().size() != 4, \ + error_code_t::ATTRIBUTE_NOT_SET, \ + "The dim for " + std::string(#port) + " is invalid"); \ + RETURN_CUDNN_FRONTEND_ERROR_IF(tensor_ptr->get_stride().size() != 4, \ + error_code_t::ATTRIBUTE_NOT_SET, \ + "The stride for " + std::string(#port) + " is invalid"); \ + RETURN_CUDNN_FRONTEND_ERROR_IF( \ + tensor_ptr->get_stride()[3] != 1, \ + error_code_t::GRAPH_NOT_SUPPORTED, \ + "The stride for the last dimension corresponding to the embedding size per head should be 1 for " + \ + std::string(#port)); \ + } + + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::Q, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::K, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::V, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::O, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::Stats, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(input_names::dO, attributes.inputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(output_names::dQ, attributes.outputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(output_names::dK, attributes.outputs); + CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE(output_names::dV, attributes.outputs); + +#undef CUDNN_FE_SDPA_VALIDATE_DIM_STRIDE + + // validate backend limitations for the operation + // clang-format off + int64_t s_q = attributes.inputs.at(input_names::Q)->get_dim()[2]; + int64_t s_kv = attributes.inputs.at(input_names::K)->get_dim()[2]; + int64_t h_q = attributes.inputs.at(input_names::Q)->get_dim()[1]; + int64_t h_k = attributes.inputs.at(input_names::K)->get_dim()[1]; + int64_t h_v = 
attributes.inputs.at(input_names::V)->get_dim()[1]; + int64_t d_qk = attributes.inputs.at(input_names::Q)->get_dim()[3]; + int64_t d_v = attributes.inputs.at(input_names::V)->get_dim()[3]; + + auto const& dq_tensor = attributes.outputs.at(output_names::dQ); + auto const& dq_data_type = dq_tensor->get_data_type(); + auto const& dk_tensor = attributes.outputs.at(output_names::dK); + auto const& dk_data_type = dk_tensor->get_data_type(); + auto const& dv_tensor = attributes.outputs.at(output_names::dV); + auto const& dv_data_type = dv_tensor->get_data_type(); + + auto const& bias_mask = attributes.inputs.find(input_names::Bias); + bool const is_bias = (bias_mask != attributes.inputs.end() && bias_mask->second != nullptr); + + auto const& dropout_mask = attributes.inputs.find(input_names::Dropout_mask); + bool const is_dropout_custom = (dropout_mask != attributes.inputs.end()) && (dropout_mask->second != nullptr); + bool const is_dropout = attributes.dropout_probability.has_value(); + + // validation TODO: + // - validate stats has valid dims + + // validate basic dimension requirements + if(prop.major >= 10) { + RETURN_CUDNN_FRONTEND_ERROR_IF(((d_qk > 128) || (d_qk % 16 != 0)) && !(d_qk == 192 && d_v == 128), + error_code_t::GRAPH_NOT_SUPPORTED, + "hidden_dim d_qk shoud be less than or equal to 128 and hidden_dim d_qk should be multiple of 16 unless d_qk == 192 and d_v == 128"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(((d_v > 128) || (d_v % 16 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "hidden_dim d_v shoud be less than or equal to 128 and hidden_dim d_v should be multiple of 16"); + } + else { + RETURN_CUDNN_FRONTEND_ERROR_IF((d_qk != 128) || (d_qk % 16 != 0) || (d_v != 128) || (d_v % 16 != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "hidden_dim shoud be equal to 128 and hidden_dim should be multiple of 16"); + } + RETURN_CUDNN_FRONTEND_ERROR_IF((h_q % h_k != 0) || (h_q % h_v != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "For group-query attention, number of heads for 
key and query must be a factor of number of heads for query"); + + // validate options for attn_scale + auto const& attn_scale = attributes.inputs.find(input_names::Attn_scale); + bool const has_attn_scale = (attn_scale != attributes.inputs.end()) && (attn_scale->second != nullptr); + RETURN_CUDNN_FRONTEND_ERROR_IF(has_attn_scale && attributes.attn_scale_value.has_value(), + error_code_t::ATTRIBUTE_NOT_SET, + "attn_scale with tensor and value cannot be set at the same time."); + + // validate options for bias mask + RETURN_CUDNN_FRONTEND_ERROR_IF(is_bias && (bias_mask->second->get_data_type() == DataType_t::BOOLEAN), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bias mask data type cannot be boolean"); + + // validate options for padding mask + auto const& seq_len_q = attributes.inputs.find(input_names::SEQ_LEN_Q); + bool const has_seq_len_q = (seq_len_q != attributes.inputs.end()) && (seq_len_q->second != nullptr); + auto const& seq_len_kv = attributes.inputs.find(input_names::SEQ_LEN_KV); + bool const has_seq_len_kv = (seq_len_kv != attributes.inputs.end()) && (seq_len_kv->second != nullptr); + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.padding_mask && (!has_seq_len_q || !has_seq_len_kv), + error_code_t::ATTRIBUTE_NOT_SET, + "Padding mask requires seq_len_q and seq_len_kv to be set."); + RETURN_CUDNN_FRONTEND_ERROR_IF((!attributes.padding_mask) && (has_seq_len_q || has_seq_len_kv), + error_code_t::ATTRIBUTE_NOT_SET, + "seq_len_q and seq_len_kv needs to be set only if padding mask is enabled."); + + // validate options for dropout mask + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.dropout_probability.has_value() && is_dropout_custom, + error_code_t::ATTRIBUTE_NOT_SET, + "Using both, custom dropout mask and internal-mask generation using dropout probability, is ill-formed."); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + attributes.dropout_probability.has_value() && attributes.dropout_probability.value() == 1.0, + error_code_t::ATTRIBUTE_NOT_SET, + "Dropout probability cannot be 
1 as corresponding scale wont be well formed."); + + + // Validate options for causal_mask_bottom_right + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.causal_mask_bottom_right && detail::get_backend_version() < 90700, + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.7.0, bottom right causal masking is not supported."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.causal_mask_bottom_right && prop.major < 10, + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is only supported on Blackwell architecture and newer. Please " + "consider using a newer architecture."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.causal_mask && attributes.causal_mask_bottom_right, + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask and causal mask cannot be both enabled"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.causal_mask_bottom_right && s_q > s_kv, + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask does not support s_q > s_kv. Please virtually slice the Q tensor and pass it as s_q == s_kv"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.causal_mask_bottom_right && (is_bias || is_dropout), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask is only supported with is_bias=False, is_dropout=False."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(attributes.causal_mask_bottom_right && ((s_q % 64 != 0) || (s_kv % 64 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask is only supported with s_q multiple of 64, and s_kv multiple of 64"); + + // validate that datatype is set for the graph + RETURN_CUDNN_FRONTEND_ERROR_IF(context.get_intermediate_data_type() == DataType_t::NOT_SET, + error_code_t::ATTRIBUTE_NOT_SET, + "Intermediate tensor data type needs to be set as internal tensors require it."); + + // validate options for deterministic algorithm + if (attributes.is_deterministic_algorithm && (prop.major == 10)) { + RETURN_CUDNN_FRONTEND_ERROR_IF((detail::get_backend_version() < 91900), + 
error_code_t::GRAPH_NOT_SUPPORTED, + "FP8 deterministic algorithm is not supported on blackwell architecture with cudnn version below 9.19.0"); + + // dbias bias rng/dropout alibi + RETURN_CUDNN_FRONTEND_ERROR_IF(is_dropout, + error_code_t::GRAPH_NOT_SUPPORTED, + "FP8 deterministic algorithm is not supported on blackwell architecture when dropout is enabled"); + + is_deterministic_algorithm_supported_on_blackwell = true; + } + + // if output data type is half or bfloat16 for any of dq, dk, dv, and version is below 9.13 or is not blackwell, return NOT_SUPPORTED + RETURN_CUDNN_FRONTEND_ERROR_IF( + (dq_data_type == DataType_t::HALF || dq_data_type == DataType_t::BFLOAT16 || + dk_data_type == DataType_t::HALF || dk_data_type == DataType_t::BFLOAT16 || + dv_data_type == DataType_t::HALF || dv_data_type == DataType_t::BFLOAT16) && + (detail::get_backend_version() < 91300 || prop.major < 10), + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is only supported on cuDNN version 9.13.0 and newer. 
Please " + "consider upgrading your current version."); + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + return {error_code_t::OK, ""}; + } + + error_t + expand_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for Scaled_dot_product_flash_attention node " + << attributes.name); + + attributes.fill_from_context(context); + + // Gather dim to fill properties of virtual tensors + auto const& q_dim = attributes.inputs[input_names::Q]->get_dim(); + auto b = q_dim[0]; + auto h_q = q_dim[1]; + auto s_q = q_dim[2]; + // auto d_qk = q_dim[3]; + auto const& k_dim = attributes.inputs[input_names::K]->get_dim(); + // auto h_k = k_dim[1]; + auto s_kv = k_dim[2]; + // auto const& v_dim = attributes.inputs[input_names::V]->get_dim(); + // auto h_v = v_dim[1]; + // auto d_v = v_dim[3]; + + // cuDNN frontend API attention requires Q, K, V where + // Q = {b, h_q, s_q, d_qk} + // K = {b, h_k, s_kv, d_qk} + // V = {b, h_v, s_kv, d_v} + // but cuDNN backend API attention requires Q, KT, VT + // Q = {b, h_q, s_q, d_qk} + // KT = {b, h_k, d_qk, s_kv} + // VT = {b, h_v, d_v, s_kv} + // So the code below maps the K->KT and V->VT + std::vector temp_vec; + + temp_vec = attributes.inputs[input_names::K]->get_dim(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::K]->set_dim(temp_vec); + + temp_vec = attributes.inputs[input_names::K]->get_stride(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::K]->set_stride(temp_vec); + + temp_vec = attributes.inputs[input_names::V]->get_dim(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::V]->set_dim(temp_vec); + + temp_vec = attributes.inputs[input_names::V]->get_stride(); + std::swap(temp_vec[2], temp_vec[3]); + attributes.inputs[input_names::V]->set_stride(temp_vec); + + std::shared_ptr rng_output; + + auto mul_attributes = Pointwise_attributes().set_mode(PointwiseMode_t::MUL); + + // if dropout_prob is 
used, then the node passes scale and scale inverse + // if dropout_mask is used, then the user passes scale and scale_inverse + bool is_dropout_prob = (attributes.dropout_probability.has_value()); + bool is_dropout_mask = (attributes.inputs[input_names::Dropout_mask] != nullptr); + if (is_dropout_prob) { + float dropout_scale_value = 1.0f / (1.0f - attributes.dropout_probability.value()); + float dropout_scale_inv_value = (1.0f - attributes.dropout_probability.value()); + + attributes.inputs[input_names::Dropout_scale] = std::make_shared(dropout_scale_value); + attributes.inputs[input_names::Dropout_scale_inv] = + std::make_shared(dropout_scale_inv_value); + } + + // --------------RNG node-------------------- + + if (is_dropout_prob) { + rng_output = rng(attributes.inputs[input_names::Seed], + attributes.inputs[input_names::Offset], + Rng_attributes() + .set_name("rng") + .set_distribution(RngDistribution_t::BERNOULLI) + .set_bernoulli_probability(1.0f - attributes.dropout_probability.value())); + rng_output->set_dim({b, h_q, s_q, s_kv}).set_stride({h_q * s_q * s_kv, s_q * s_kv, s_kv, 1}); + } else if (is_dropout_mask) { + rng_output = attributes.inputs[input_names::Dropout_mask]; + } + + //// dO * O + mul_attributes.set_name("mul_dO_O"); + auto last_output = + pointwise(attributes.inputs[input_names::dO], attributes.inputs[input_names::O], mul_attributes); + + // reduce(dO) + last_output = + reduction(last_output, Reduction_attributes().set_name("reduce_dO").set_mode(ReductionMode_t::ADD)); + last_output->set_dim({b, h_q, s_q, 1}).set_stride({h_q * s_q, s_q, 1, 1}); + + // Descale dO + mul_attributes.set_name("descale_dO"); + last_output = pointwise(last_output, attributes.inputs.at(input_names::Descale_dO), mul_attributes); + last_output->set_dim({b, h_q, s_q, 1}).set_stride({h_q * s_q, s_q, 1, 1}); + + // Descale O + mul_attributes.set_name("descale_O"); + last_output = pointwise(last_output, attributes.inputs.at(input_names::Descale_O), mul_attributes); + + // 
softmax_sum = last_output * dropout_scale + if(attributes.inputs[input_names::Dropout_scale_inv]) { + last_output = pointwise(last_output, + attributes.inputs[input_names::Dropout_scale_inv], + Pointwise_attributes().set_name("scale_dropout_inv").set_mode(PointwiseMode_t::MUL)); + } + auto softmax_sum = last_output; + + //// Q * K + auto bmm_Q_K_attributes = Matmul_attributes().set_name("bmm_Q_K") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_n_override(attributes.inputs[input_names::SEQ_LEN_KV]); + auto last_dV = matmul(attributes.inputs[input_names::Q], attributes.inputs[input_names::K], bmm_Q_K_attributes); + + //// Optional Attn scale + // In case user provided a scalar value, do a fused scalar. + if (attributes.attn_scale_value.has_value()) { + attributes.inputs[input_names::Attn_scale] = + std::make_shared(attributes.attn_scale_value.value()); + } + + // If attn scale present, add a pointwise mul node + if (auto attn_scale_it = attributes.inputs.find(input_names::Attn_scale); attn_scale_it != attributes.inputs.end()) { + mul_attributes.set_name("attn_scale"); + last_dV = pointwise(last_dV, attn_scale_it->second, mul_attributes); + } + + //// Descales + // Descale Q + mul_attributes.set_name("descale_q"); + last_dV = pointwise(last_dV, attributes.inputs.at(input_names::Descale_Q), mul_attributes); + + // Descale K + mul_attributes.set_name("descale_k"); + last_dV = pointwise(last_dV, attributes.inputs.at(input_names::Descale_K), mul_attributes); + + // (optional) last_dV = last_dV + bias + if (auto bias_it = attributes.inputs.find(input_names::Bias); bias_it != attributes.inputs.end()) { + last_dV = pointwise(last_dV, + bias_it->second, + Pointwise_attributes().set_name("add_bias").set_mode(PointwiseMode_t::ADD)); + } + + // (optional) Apply padding mask + if (attributes.padding_mask) { + auto row_idx_output = pointwise(last_dV, + Pointwise_attributes() + .set_name("gen_row_idx_padding") + .set_mode(PointwiseMode_t::GEN_INDEX) + 
.set_axis(2) + .set_compute_data_type(DataType_t::INT32)); + row_idx_output->set_data_type(DataType_t::INT32); + + auto col_idx_output = pointwise(last_dV, + Pointwise_attributes() + .set_name("gen_col_idx_padding") + .set_mode(PointwiseMode_t::GEN_INDEX) + .set_axis(3) + .set_compute_data_type(DataType_t::INT32)); + col_idx_output->set_data_type(DataType_t::INT32); + + auto row_mask_output = pointwise(row_idx_output, + attributes.inputs[input_names::SEQ_LEN_Q], + Pointwise_attributes() + .set_name("lt_row_sq_padding") + .set_mode(PointwiseMode_t::CMP_LT) + .set_compute_data_type(DataType_t::BOOLEAN)); + row_mask_output->set_data_type(DataType_t::BOOLEAN); + + auto col_mask_output = pointwise(col_idx_output, + attributes.inputs[input_names::SEQ_LEN_KV], + Pointwise_attributes() + .set_name("lt_col_skv_padding") + .set_mode(PointwiseMode_t::CMP_LT) + .set_compute_data_type(DataType_t::BOOLEAN)); + col_mask_output->set_data_type(DataType_t::BOOLEAN); + + auto padding_mask_output = pointwise(row_mask_output, + col_mask_output, + Pointwise_attributes() + .set_name("and_row_col_padding") + .set_mode(PointwiseMode_t::LOGICAL_AND) + .set_compute_data_type(DataType_t::BOOLEAN)); + padding_mask_output->set_data_type(DataType_t::BOOLEAN); + + // Use a smaller value of neg infinity so that the softmax stats for rows that are fully padded dont + // go towards NaNs/Infs when multipled by the numerous scale/descale + auto negative_inf_padding = std::make_shared(attn::score_modifiers::get_negative_inf_value()); + + last_dV = + pointwise(last_dV, + negative_inf_padding, + padding_mask_output, + Pointwise_attributes().set_name("select_padding").set_mode(PointwiseMode_t::BINARY_SELECT)); + } + + //// Optional causal masking + if (attributes.causal_mask) { + auto row_index_attributes = + Pointwise_attributes().set_name("gen_row_index").set_mode(PointwiseMode_t::GEN_INDEX).set_axis(2); + std::shared_ptr row_index_output = pointwise(last_dV, row_index_attributes); + 
row_index_output->set_data_type(DataType_t::INT32); + + auto col_index_attributes = + Pointwise_attributes().set_name("gen_col_index").set_mode(PointwiseMode_t::GEN_INDEX).set_axis(3); + auto const& col_index_output = pointwise(last_dV, col_index_attributes); + col_index_output->set_data_type(DataType_t::INT32); + + if (attributes.causal_mask_bottom_right) { + if (attributes.inputs[input_names::SEQ_LEN_KV]) { + row_index_output = pointwise(row_index_output, + attributes.inputs[input_names::SEQ_LEN_KV], + Pointwise_attributes() + .set_name("row_idx_add_skv") + .set_mode(PointwiseMode_t::ADD) + .set_compute_data_type(DataType_t::INT32)); + } else { + row_index_output = pointwise(row_index_output, + std::make_shared(static_cast(s_kv)), + Pointwise_attributes() + .set_name("row_idx_add_skv") + .set_mode(PointwiseMode_t::ADD) + .set_compute_data_type(DataType_t::INT32)); + } + row_index_output->set_data_type(DataType_t::INT32); + + if (attributes.inputs[input_names::SEQ_LEN_Q]) { + row_index_output = pointwise(row_index_output, + attributes.inputs[input_names::SEQ_LEN_Q], + Pointwise_attributes() + .set_name("row_idx_add_sq_sub_sq") + .set_mode(PointwiseMode_t::SUB) + .set_compute_data_type(DataType_t::INT32)); + } else { + row_index_output = pointwise(row_index_output, + std::make_shared(static_cast(s_q)), + Pointwise_attributes() + .set_name("row_idx_add_sq_sub_sq") + .set_mode(PointwiseMode_t::SUB) + .set_compute_data_type(DataType_t::INT32)); + } + row_index_output->set_data_type(DataType_t::INT32); + } + + auto greater_than_attributes = Pointwise_attributes() + .set_name("row_greater_than_col") + .set_mode(PointwiseMode_t::CMP_GE) + .set_compute_data_type(DataType_t::BOOLEAN); + auto const& row_greater_than_col_output = + pointwise(row_index_output, col_index_output, greater_than_attributes); + row_greater_than_col_output->set_data_type(DataType_t::BOOLEAN); + + // Lower attributes to binary select attributes + auto negative_inf_causal = 
std::make_shared(attn::score_modifiers::get_negative_inf_value()); + + auto binary_select_attributes = + Pointwise_attributes().set_name("binary_select").set_mode(PointwiseMode_t::BINARY_SELECT); + last_dV = pointwise(last_dV, negative_inf_causal, row_greater_than_col_output, binary_select_attributes); + } + + //// Apply Softmax + // last_dV = last_dV - stats + last_dV = pointwise(last_dV, + attributes.inputs[input_names::Stats], + Pointwise_attributes().set_name("sub_dV_Stats").set_mode(PointwiseMode_t::SUB)); + + // last_dV = exp(last_dV) + last_dV = pointwise(last_dV, Pointwise_attributes().set_name("exp_dV").set_mode(PointwiseMode_t::EXP)); + auto exp_S = last_dV; + + // (optional) last_dV = last_dV * dropout rng_output + if (is_dropout_prob || is_dropout_mask) { + last_dV = + pointwise(last_dV, + rng_output, + Pointwise_attributes().set_name("mul_p_dropout_mask").set_mode(PointwiseMode_t::MUL)); + } + + // (optional) last_dV = last_dV * dropout_scale + if (attributes.inputs[input_names::Dropout_scale]) { + last_dV = + pointwise(last_dV, + attributes.inputs[input_names::Dropout_scale], + Pointwise_attributes().set_name("mul_dS_dropout_scale").set_mode(PointwiseMode_t::MUL)); + } + + // Scale S + mul_attributes.set_name("scale_S"); + last_dV = pointwise(last_dV, attributes.inputs.at(input_names::Scale_S), mul_attributes); + last_dV->set_data_type(attributes.inputs.at(input_names::Q)->get_data_type()); + + // Reshape S + last_dV = reshape(last_dV, Reshape_attributes().set_name("S_transpose")); + last_dV->set_name("S_T").set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); + last_dV->set_data_type(attributes.inputs[input_names::Q]->get_data_type()); + + //// S_T * dO + // Special non-functional-style call. Needed because output already created and provided to user. 
+ matmul_fp8(last_dV, + attributes.inputs[input_names::dO], + attributes.inputs[input_names::Descale_S], + attributes.inputs[input_names::Descale_dO], + attributes.inputs[input_names::Scale_dV], + Matmul_fp8_attributes().set_name("bmm_S_T_dO") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q]), + attributes.outputs[output_names::dV], + attributes.outputs[output_names::Amax_dV]); + + //// dO * V_T + auto bmm_dO_V_T_attributes = Matmul_attributes().set_name("bmm_dO_V_T") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_n_override(attributes.inputs[input_names::SEQ_LEN_KV]); + last_output = + matmul(attributes.inputs[input_names::dO], attributes.inputs[input_names::V], bmm_dO_V_T_attributes); + + //// Descales + // Descale dO + mul_attributes.set_name("descale_dO"); + last_output = pointwise(last_output, attributes.inputs.at(input_names::Descale_dO), mul_attributes); + + // Descale V + mul_attributes.set_name("descale_V"); + last_output = pointwise(last_output, attributes.inputs.at(input_names::Descale_V), mul_attributes); + + // dP = last_output - softmax_sum + auto dP = pointwise(last_output, + softmax_sum, + Pointwise_attributes().set_name("sub_dP_softmax_sum").set_mode(PointwiseMode_t::SUB)); + + // dP = dP * exp_S + mul_attributes.set_name("mul_dP_exp_S"); + dP = pointwise(dP, exp_S, mul_attributes); + + // (optional) dP = dP * dropout_scale + if (attributes.inputs[input_names::Dropout_scale]) { + dP = + pointwise(dP, + attributes.inputs[input_names::Dropout_scale], + Pointwise_attributes().set_name("mul_dS_dropout_scale").set_mode(PointwiseMode_t::MUL)); + } + + // if (attributes.outputs[output_names::dBias]) { + // reduction(dP, + // Reduction_attributes().set_name("red_dP_dBias").set_mode(ReductionMode_t::ADD), + // attributes.outputs[output_names::dBias]); + // } + + // (optional) dP = dP * attn_scale + if (auto attn_scale_it = 
attributes.inputs.find(input_names::Attn_scale); attn_scale_it != attributes.inputs.end()) { + mul_attributes.set_name("mul_dS_attn_scale"); + dP = pointwise(dP, attn_scale_it->second, mul_attributes); + } + + // Amax dP + auto amax_attributes = Reduction_attributes().set_name("amax_dP").set_mode(ReductionMode_t::AMAX); + // Special non-functional-style call. Needed because output already created and provided to user. + reduction(dP, amax_attributes, attributes.outputs.at(output_names::Amax_dP)); + + // Scale dP + mul_attributes.set_name("scale_dP"); + dP = pointwise(dP, attributes.inputs.at(input_names::Scale_dP), mul_attributes); + dP->set_data_type(attributes.inputs.at(input_names::dO)->get_data_type()); + + //// dP * K + auto const& kt_dim = attributes.inputs[input_names::K]->get_dim(); + auto const& kt_stride = attributes.inputs[input_names::K]->get_stride(); + + auto K = reshape(attributes.inputs[input_names::K], Reshape_attributes().set_name("reshape_K")); + K->set_dim({kt_dim[0], kt_dim[1], kt_dim[3], kt_dim[2]}) + .set_stride({kt_stride[0], kt_stride[1], kt_stride[3], kt_stride[2]}); + + auto bmm_dP_K_attributes = Matmul_fp8_attributes().set_name("bmm_dP_K") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_Q]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_KV]); + // Special non-functional-style call. Needed because output already created and provided to user. 
+ matmul_fp8(dP, + K, + attributes.inputs[input_names::Descale_dP], + attributes.inputs[input_names::Descale_K], + attributes.inputs[input_names::Scale_dQ], + bmm_dP_K_attributes, + attributes.outputs[output_names::dQ], + attributes.outputs[output_names::Amax_dQ]); + + //// dP.T * Q + auto dP_T_attributes = Reshape_attributes().set_name("dP_T"); + auto dP_T = reshape(dP, dP_T_attributes); + dP_T->set_data_type(attributes.inputs.at(input_names::dO)->get_data_type()); + dP_T->set_name("dP_T").set_dim({b, h_q, s_kv, s_q}).set_stride({h_q * s_q * s_kv, s_q * s_kv, 1, s_kv}); + + auto bmm_dP_T_Q_attributes = Matmul_fp8_attributes().set_name("bmm_dP_T_Q") + .set_m_override(attributes.inputs[input_names::SEQ_LEN_KV]) + .set_k_override(attributes.inputs[input_names::SEQ_LEN_Q]); + // Special non-functional-style call. Needed because output already created and provided to user. + matmul_fp8(dP_T, + attributes.inputs[input_names::Q], + attributes.inputs[input_names::Descale_dP], + attributes.inputs[input_names::Descale_Q], + attributes.inputs[input_names::Scale_dK], + bmm_dP_T_Q_attributes, + attributes.outputs[output_names::dK], + attributes.outputs[output_names::Amax_dK]); + + return {error_code_t::OK, ""}; + } + + std::pair> + override_heuristics_query() const { + if (is_deterministic_algorithm_supported_on_blackwell) { + return {5, {{KnobType_t::KERNEL_CFG, 31}, {KnobType_t::STAGES, 2}}}; + } else { + return {-1, {}}; + } + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"({"tag": "SDPA_FP8_BWD"})"_json); + } +#endif +}; + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/sdpa_support_surface.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/sdpa_support_surface.h new file mode 100644 index 00000000..6486943a --- /dev/null +++ 
b/third_party/cudnn-frontend/include/cudnn_frontend/node/sdpa_support_surface.h @@ -0,0 +1,504 @@ +#pragma once + +#include +#include + +#include "../../cudnn_frontend_Heuristics.h" +#include "../../cudnn_frontend_Logging.h" +#include "../graph_helpers.h" +#include "../node_interface.h" + +namespace cudnn_frontend::graph { + +inline error_t +SDPA_attributes::validate_sdpa_support_surface(const detail::Context& context, + int64_t s_kv, + bool is_paged_k, + bool is_paged_v) const { + // Extract dimensions from tensors + int64_t s_q = inputs.at(SDPA_attributes::input_names::Q)->get_dim()[2]; + // s_kv is passed in from the caller + int64_t h_q = inputs.at(SDPA_attributes::input_names::Q)->get_dim()[1]; + int64_t h_k = inputs.at(SDPA_attributes::input_names::K)->get_dim()[1]; + int64_t h_v = inputs.at(SDPA_attributes::input_names::V)->get_dim()[1]; + int64_t d_qk = inputs.at(SDPA_attributes::input_names::Q)->get_dim()[3]; + int64_t d_v = inputs.at(SDPA_attributes::input_names::V)->get_dim()[3]; + + bool const is_ragged = inputs.at(SDPA_attributes::input_names::Q)->get_ragged_offset() || + inputs.at(SDPA_attributes::input_names::K)->get_ragged_offset() || + inputs.at(SDPA_attributes::input_names::V)->get_ragged_offset() || + outputs.at(SDPA_attributes::output_names::O)->get_ragged_offset(); + + auto const& output_tensor = outputs.at(SDPA_attributes::output_names::O); + auto const& output_data_type = output_tensor->get_data_type(); + + auto const& bias_mask = inputs.find(SDPA_attributes::input_names::Bias); + bool const is_bias = (bias_mask != inputs.end() && bias_mask->second != nullptr); + + auto const& dropout_mask = inputs.find(SDPA_attributes::input_names::Dropout_mask); + bool const is_dropout_custom = (dropout_mask != inputs.end()) && (dropout_mask->second != nullptr); + bool const is_dropout = dropout_probability.has_value() || is_dropout_custom; + + bool const is_paged = is_paged_k || is_paged_v; + + auto const& rng_tensor = 
outputs.find(SDPA_attributes::output_names::RNG_DUMP); + bool const is_rng = (rng_tensor != outputs.end() && rng_tensor->second != nullptr); + + bool const max_seq_kv_explicit = max_seq_len_kv.has_value(); + + auto const& attn_scale = inputs.find(SDPA_attributes::input_names::Attn_scale); + bool const has_attn_scale = (attn_scale != inputs.end()) && (attn_scale->second != nullptr); + + auto const& seq_len_q = inputs.find(SDPA_attributes::input_names::SEQ_LEN_Q); + bool const has_seq_len_q = (seq_len_q != inputs.end()) && (seq_len_q->second != nullptr); + auto const& seq_len_kv = inputs.find(SDPA_attributes::input_names::SEQ_LEN_KV); + bool const has_seq_len_kv = (seq_len_kv != inputs.end()) && (seq_len_kv->second != nullptr); + + // validation TODO: + // - validate stats has valid dims + + // Get device properties + cudaDeviceProp prop; + int device; + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device(&device)); + _CUDNN_CHECK_CUDA_ERROR(detail::cuda_get_device_properties(&prop, device)); + + // Common FP16 and FP8 validation + // validate basic dimension requirements + RETURN_CUDNN_FRONTEND_ERROR_IF( + (h_q % h_k != 0) || (h_q % h_v != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "For group-query attention, number of heads for key and query must be a factor of number of heads for query"); + + // validate options for attn_scale + RETURN_CUDNN_FRONTEND_ERROR_IF(has_attn_scale && attn_scale_value.has_value(), + error_code_t::ATTRIBUTE_NOT_SET, + "attn_scale with tensor and value cannot be set at the same time."); + + // validate options for bias mask + RETURN_CUDNN_FRONTEND_ERROR_IF(is_bias && (bias_mask->second->get_data_type() == DataType_t::BOOLEAN), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bias mask data type cannot be boolean"); + RETURN_CUDNN_FRONTEND_ERROR_IF(is_bias && detail::get_backend_version() < 8906, + error_code_t::GRAPH_NOT_SUPPORTED, + "Bias mask is not supported below cudnn version 8.9.6"); + + 
RETURN_CUDNN_FRONTEND_ERROR_IF((detail::get_backend_version() >= 8906 && detail::get_backend_version() < 90000) && + (context.get_sm_version() > 0 && context.get_sm_version() < 90), + error_code_t::GRAPH_NOT_SUPPORTED, + "Post scale Bias mask is not supported below Hopper for cudnn version" + + std::to_string(detail::get_backend_version())); + + // validate options for padding mask + RETURN_CUDNN_FRONTEND_ERROR_IF(padding_mask && (!has_seq_len_q || !has_seq_len_kv), + error_code_t::ATTRIBUTE_NOT_SET, + "Padding mask requires seq_len_q and seq_len_kv to be set."); + RETURN_CUDNN_FRONTEND_ERROR_IF((!padding_mask && !attention_score_modifier) && (has_seq_len_q || has_seq_len_kv), + error_code_t::ATTRIBUTE_NOT_SET, + "seq_len_q and seq_len_kv needs to be set only if padding mask is enabled."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(is_ragged && ((padding_mask == false) && (attention_score_modifier == nullptr)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Ragged offsets are only supported with padding mask."); + + // validate options for dropout mask + RETURN_CUDNN_FRONTEND_ERROR_IF( + dropout_probability.has_value() && is_dropout_custom, + error_code_t::ATTRIBUTE_NOT_SET, + "Using both, custom dropout mask and internal-mask generation using dropout probability, is ill-formed."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(dropout_probability.has_value() && dropout_probability.value() == 1.0, + error_code_t::ATTRIBUTE_NOT_SET, + "Dropout probability cannot be 1 as corresponding scale wont be well formed."); + + // validate options for causal mask and bottom right causal mask + RETURN_CUDNN_FRONTEND_ERROR_IF( + (padding_mask || alibi_mask || has_causal_mask_bottom_right()) && (detail::get_backend_version() < 8906), + error_code_t::GRAPH_NOT_SUPPORTED, + "Only causal mask is supported in cudnn versions below 8.9.6"); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + has_causal_mask_bottom_right() && (!padding_mask) && s_q > s_kv, + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask does 
not support max_s_q > max_s_kv. Please virtually slice the Q tensor and pass it " + "as max_s_q == max_s_kv"); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + has_causal_mask_bottom_right() && (is_bias || alibi_mask || is_dropout), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask is only supported with is_bias=False, is_alibi=False, is_dropout=False."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(has_causal_mask_bottom_right() && (detail::get_backend_version() < 90600) && + ((s_q % 64 != 0) || (s_kv % 64 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bottom right causal mask is only supported with s_q multiple of 64, and s_kv " + "multiple of 64, for cudnn version below 9.6.0"); + + // validate that datatype is set for the graph + RETURN_CUDNN_FRONTEND_ERROR_IF(context.get_intermediate_data_type() == DataType_t::NOT_SET, + error_code_t::ATTRIBUTE_NOT_SET, + "Intermediate tensor data type needs to be set as internal tensors require it."); + + if (mma_core_mode == DataType_t::FP8_E4M3 || mma_core_mode == DataType_t::FP8_E5M2) { + // FP8 specific validation + + // version specific validation + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90100, + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is only supported starting cudnn 9.1.0. Please " + "consider upgrading your current version."); + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() == 91000, + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is not supported on cudnn 9.10.0. Please " + "consider upgrading your current version."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + prop.major < 9, + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is only supported on Hopper architecture and newer. 
Please " + "consider using a newer architecture."); + + // validate basic dimension requirements + // d_qk=192 with d_v=128 is only supported starting from cuDNN 9.19 + bool const d192_v128_supported = (detail::get_backend_version() >= 91900); + if (prop.major >= 10) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + ((d_qk > 128) || (d_qk % 16 != 0)) && !(d192_v128_supported && d_qk == 192 && d_v == 128), + error_code_t::GRAPH_NOT_SUPPORTED, + "hidden_dim d_qk should be less than or equal to 128 and hidden_dim d_qk " + "should be multiple of 16 unless d_qk == 192 and d_v == 128 (requires cuDNN 9.19+)"); + RETURN_CUDNN_FRONTEND_ERROR_IF( + ((d_v > 128) || (d_v % 16 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "hidden_dim d_v should be less than or equal to 128 and hidden_dim d_v should be multiple of 16"); + } else { + RETURN_CUDNN_FRONTEND_ERROR_IF( + (d_qk > 256) || (d_qk % 16 != 0) || (d_v > 256) || (d_v % 16 != 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "hidden_dim shoud be less than or equal to 256 and hidden_dim should be multiple of 16"); + } + + // Validate options for causal_mask_bottom_right + RETURN_CUDNN_FRONTEND_ERROR_IF(has_causal_mask_bottom_right() && detail::get_backend_version() < 90700, + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.7.0, bottom right causal masking is not supported."); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + has_causal_mask_bottom_right() && prop.major < 10, + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is only supported on Blackwell architecture and newer. 
Please " + "consider using a newer architecture."); + + // if output data type is half or bfloat16, and version is below 9.13 or is not blackwell, return NOT_SUPPORTED + RETURN_CUDNN_FRONTEND_ERROR_IF( + (output_data_type == DataType_t::HALF || output_data_type == DataType_t::BFLOAT16) && + (detail::get_backend_version() < 91300 || prop.major < 10), + error_code_t::GRAPH_NOT_SUPPORTED, + "sdpa fp8 forward operation is only supported on cuDNN version 9.13.0 and newer. Please " + "consider upgrading your current version."); + } else if (mma_core_mode == DataType_t::HALF) { + // FP16 specific validation + + RETURN_CUDNN_FRONTEND_ERROR_IF( + (attention_score_modifier != nullptr) && + (alibi_mask || has_causal_like_masking() || padding_mask || left_bound.has_value()), + error_code_t::GRAPH_NOT_SUPPORTED, + "Attention score mod enabled and hence other subgraphs are disabled."); + + // validate basic dimension requirements + RETURN_CUDNN_FRONTEND_ERROR_IF( + (d_qk % 8 != 0) || (d_v % 8 != 0), error_code_t::GRAPH_NOT_SUPPORTED, "hidden_dim should be multiple of 8"); + + // validate alibi requirements + RETURN_CUDNN_FRONTEND_ERROR_IF(alibi_mask && !(right_bound.has_value() && right_bound.value() == 0), + error_code_t::GRAPH_NOT_SUPPORTED, + "When alibi mask is used, diagonal_band_right_bound needs to be set to 0."); + + // validate options for bottom right causal mask + RETURN_CUDNN_FRONTEND_ERROR_IF(has_causal_mask_bottom_right() && (detail::get_backend_version() < 90300), + error_code_t::GRAPH_NOT_SUPPORTED, + "Causal bottom right masking requires cudnn 9.3.0 and above"); + + // Combination of mask and bias + RETURN_CUDNN_FRONTEND_ERROR_IF( + (is_bias && (has_causal_like_masking() || padding_mask) && (detail::get_backend_version() < 8906)), + error_code_t::GRAPH_NOT_SUPPORTED, + "Bias + padding or causal mask is only supported in 8.9.6 and above"); + + // validate options for sliding window length + RETURN_CUDNN_FRONTEND_ERROR_IF((left_bound.has_value() && 
detail::get_backend_version() < 90200), + error_code_t::GRAPH_NOT_SUPPORTED, + "sliding window is only supported 9.2.0 and above"); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + left_bound.has_value() && left_bound.value() <= 0 && detail::get_backend_version() < 91000, + error_code_t::INVALID_VALUE, + "Left bound (Sliding window length) should be greater than zero when set."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(left_bound.has_value() && (!padding_mask) && s_q > s_kv, + error_code_t::GRAPH_NOT_SUPPORTED, + "Sliding window attention is only supported with max_s_q <= max_s_kv."); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + left_bound.has_value() && (s_q * left_bound.value() == s_kv * left_bound.value()) && + (detail::get_backend_version() <= 90900) && (prop.major == 9) && has_causal_mask_bottom_right(), + error_code_t::GRAPH_NOT_SUPPORTED, + "On Hopper architecture, this specific combination of s_q, s_kv, and left_bound + right_bound + bottom " + "right diagonal alignment is not supported for backend version 9.9 or below"); + + if ((detail::get_backend_version() < 91002)) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + left_bound.has_value() && (!has_causal_like_masking() || is_dropout || is_bias), + error_code_t::GRAPH_NOT_SUPPORTED, + "Left and right bounds are only supported with is_dropout=False, is_bias=False. And the diagonal " + "alignment must be set."); + } + + RETURN_CUDNN_FRONTEND_ERROR_IF(right_bound.has_value() && right_bound.value() < 0, + error_code_t::INVALID_VALUE, + "Right bound needs to be larger than or equal to zero"); + + // Validate options for s_q == 1 + const bool is_decode_only = (s_q == 1); + RETURN_CUDNN_FRONTEND_ERROR_IF(is_decode_only && (prop.major == 10) && (d_qk > 128 || d_v > 128) && + (detail::get_backend_version() <= 90900), + error_code_t::GRAPH_NOT_SUPPORTED, + "decode only mode, i.e. 
s_q == 1 not supported for blackwell architecture with " + "d_qk or d_v > 128 for backend version 9.9 or below"); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + is_decode_only && (detail::get_backend_version() <= 90900) && (right_bound.has_value()), + error_code_t::GRAPH_NOT_SUPPORTED, + "decode only mode, i.e. s_q == 1, not supported with masking (right_bound is set) for backend version 9.9 " + "or below"); + + // validate options for paged attention + RETURN_CUDNN_FRONTEND_ERROR_IF( + is_paged && (d_qk > 128 || d_v > 128) && detail::get_backend_version() <= 90900, + error_code_t::GRAPH_NOT_SUPPORTED, + "Paged attention only supported with d_qk and d_v <= 128 for backend version 9.9 or below"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(is_paged && is_ragged && detail::get_backend_version() < 90700, + error_code_t::GRAPH_NOT_SUPPORTED, + "Paged caches are not supported in combination with ragged offsets."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(is_paged && (!has_seq_len_q || !has_seq_len_kv), + error_code_t::GRAPH_NOT_SUPPORTED, + "Paged caches can only be used in combination with padding mask and variable " + "sequence lengths for both Q and KV."); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + !is_paged && max_seq_kv_explicit, + error_code_t::GRAPH_NOT_SUPPORTED, + "When not using paged attention, there is no need to explicitly set max kv sequence length."); + + if (max_seq_kv_explicit) { + auto max_seq_kv = max_seq_len_kv.value(); + + RETURN_CUDNN_FRONTEND_ERROR_IF(is_bias && (bias_mask->second->get_dim()[3] != max_seq_kv), + error_code_t::GRAPH_NOT_SUPPORTED, + "Value set through set_paged_attention_max_seq_len_kv is incompatible with " + "the sequence length of the bias"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(is_rng && rng_tensor->second->get_dim()[3] != max_seq_kv, + error_code_t::GRAPH_NOT_SUPPORTED, + "Value set through set_paged_attention_max_seq_len_kv is incompatible with " + "the sequence length of the RNG_DUMP"); + } + + // Additional validation for paged attention with packed page 
tables + RETURN_CUDNN_FRONTEND_ERROR_IF( + ((is_paged_k && inputs.at(SDPA_attributes::input_names::Page_table_K)->get_ragged_offset()) || + (is_paged_v && inputs.at(SDPA_attributes::input_names::Page_table_V)->get_ragged_offset())) && + detail::get_backend_version() < 91002, + error_code_t::GRAPH_NOT_SUPPORTED, + "Paged attention with packed page tables only supported with cudnn version 9.10.2 and above"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 8903, + error_code_t::GRAPH_NOT_SUPPORTED, + "SDPA OP requires cudnn version 8.9.3 and above"); + + // If user has set sm_version allow SM specific checks + if (context.get_sm_version() > 0) { + RETURN_CUDNN_FRONTEND_ERROR_IF(80 > context.get_sm_version(), + error_code_t::GRAPH_NOT_SUPPORTED, + "cudnn SDPA operation requires Ampere and above"); + } + + // (cudnn_runtime_version < 8907 && num_attn_heads == num_gqa_groups FIXME + + // version specific validation + if (prop.major == 8) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + detail::get_backend_version() <= 90900 && ((d_qk > 128) || (d_v > 128)), + error_code_t::GRAPH_NOT_SUPPORTED, + "head_dim should be less than or equal to 128 for backend version 9.9 or below on ampere architecture"); + } + if (prop.major == 9) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + detail::get_backend_version() <= 90900 && ((d_qk > 256) || (d_v > 256)), + error_code_t::GRAPH_NOT_SUPPORTED, + "head_dim should be less than or equal to 256 for backend version 9.9 or below on hopper architecture"); + } + if (prop.major == 10) { + RETURN_CUDNN_FRONTEND_ERROR_IF((detail::get_backend_version() < 90900) && ((d_qk > 128) || (d_v > 128)), + error_code_t::GRAPH_NOT_SUPPORTED, + "head_dim should be less than or equal to 128 for backend version 9.8 or " + "below on blackwell architecture"); + } + + RETURN_CUDNN_FRONTEND_ERROR_IF( + detail::get_backend_version() < 8906 && ((s_kv % 64 != 0) || (d_qk % 64 != 0)), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 8.9.6, s_kv not a 
multiple of 64 or d not a multiple of 64 is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 8907 && (s_kv % 64 != 0) && (!(padding_mask)), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 8.9.7, s_kv not a multiple of 64 is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF( + detail::get_backend_version() < 90000 && ((s_q % 64 != 0) || (s_kv % 64 != 0)) && + (padding_mask || is_dropout), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.0.0, s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90200 && left_bound.has_value(), + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.2.0, sliding window attention is not supported"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(detail::get_backend_version() < 90500 && is_paged, + error_code_t::GRAPH_NOT_SUPPORTED, + "For cuDNN version below 9.5.0, paged caches are not supported"); + + if (is_ragged) { + RETURN_CUDNN_FRONTEND_ERROR_IF((context.get_sm_version() > 0 && context.get_sm_version() < 90), + error_code_t::GRAPH_NOT_SUPPORTED, + "THD (ragged offset) is only supported in Hopper and above"); + } + // TODO add version check once fixed + RETURN_CUDNN_FRONTEND_ERROR_IF(prop.major == 10 && is_rng, + error_code_t::GRAPH_NOT_SUPPORTED, + "dropout RNG dump is not supported for Blackwell architecture"); + } else { + RETURN_CUDNN_FRONTEND_ERROR_IF(true, error_code_t::GRAPH_NOT_SUPPORTED, "Unsupported mma core mode"); + } + + // Check whether the selected implementation supports the requested features. + CHECK_CUDNN_FRONTEND_ERROR(verify_sdpa_support_surface_for_implementation(context, implementation)); + + return {error_code_t::OK, ""}; +} + +// Verify that the underlying implementation supports all the features in these attributes. 
+// Unlike `validate_sdpa_support_surface()`, this may be called before validation, so: +// * don't assume any particular keys already exist in `inputs` or `outputs` +// * don't assume any tensor dims or strides are already set +// We return error codes directly instead of using `RETURN_CUDNN_FRONTEND_ERROR_IF` +// to avoid unneeded logging when this function is being called in a non-error-generating +// situation (e.g. during auto-select of SDPA implementation). +inline error_t +SDPA_attributes::verify_sdpa_support_surface_for_implementation(const detail::Context& context, + AttentionImplementation_t impl) const { + switch (impl) { + case AttentionImplementation_t::AUTO: + // This function should not be called with AUTO. + return {error_code_t::INVALID_VALUE, + "Can't call verify_sdpa_support_surface_for_implementation with impl=AUTO"}; + case AttentionImplementation_t::COMPOSITE: + for (const auto& [key, value] : inputs) { + RETURN_CUDNN_FRONTEND_ERROR_IF(key == input_names::Block_mask && value != nullptr, + error_code_t::GRAPH_NOT_SUPPORTED, + "Composite SDPA node doesn't support Block_mask input"); + } + break; + case AttentionImplementation_t::UNIFIED: { + auto effective_cudnn_ver = std::min(detail::get_backend_version(), detail::get_compiled_version()); + RETURN_CUDNN_FRONTEND_ERROR_IF(effective_cudnn_ver < 91301, + error_code_t::GRAPH_NOT_SUPPORTED, + "Unified SDPA node requires cuDNN 9.13.1"); + + RETURN_CUDNN_FRONTEND_ERROR_IF(context.get_dynamic_shape_enabled(), + error_code_t::GRAPH_NOT_SUPPORTED, + "Unified SDPA node doesn't yet support dynamic shape"); + + // TODO: Provide smarter error messages that provide the required cuDNN version for each input. 
+ std::unordered_set allowed_input_names{ + input_names::Q, input_names::K, input_names::V, input_names::Attn_scale}; + std::string allowed_input_msg = + "Unified SDPA node doesn't yet support inputs other than Q, K, V, Attn_scale"; + + if (effective_cudnn_ver >= 91400) { + allowed_input_names.insert({input_names::Block_mask}); + allowed_input_msg += ", Block_mask"; + } + + if (effective_cudnn_ver >= 91500) { + allowed_input_names.insert({input_names::Page_table_K, + input_names::Page_table_V, + input_names::SEQ_LEN_Q, + input_names::SEQ_LEN_KV}); + allowed_input_msg += ", Page_table_K, Page_table_V, SEQ_LEN_Q, SEQ_LEN_KV"; + } + + for (const auto& [key, value] : inputs) { + if (allowed_input_names.find(key) == allowed_input_names.end() && value != nullptr) { + return {error_code_t::GRAPH_NOT_SUPPORTED, allowed_input_msg}; + } + } + + for (const auto& [key, value] : outputs) { + if (key != output_names::O && key != output_names::Stats && value != nullptr) { + return {error_code_t::GRAPH_NOT_SUPPORTED, + "Unified SDPA node doesn't yet support outputs other than O and Stats"}; + } + } + + if (alibi_mask) { + return {error_code_t::GRAPH_NOT_SUPPORTED, "Unified SDPA node doesn't yet support alibi mask"}; + } + + if (padding_mask && effective_cudnn_ver < 91500) { + return {error_code_t::GRAPH_NOT_SUPPORTED, "Padding mask for unified SDPA node requires cuDNN 9.15.0"}; + } + + if (left_bound.has_value() || right_bound.has_value()) { + return {error_code_t::GRAPH_NOT_SUPPORTED, + "Unified SDPA node doesn't yet support left bound or right bound"}; + } + + if (diagonal_alignment != DiagonalAlignment_t::TOP_LEFT) { + return {error_code_t::GRAPH_NOT_SUPPORTED, "Unified SDPA node doesn't yet support diagonal alignment"}; + } + + if (dropout_probability.has_value()) { + return {error_code_t::GRAPH_NOT_SUPPORTED, "Unified SDPA node doesn't yet support dropout"}; + } + + // Unified engine in cuDNN < 9.15 can't meaningfully support max sequence length, + // while versions >= 9.15 
"support" it by ignoring it (unified engine doesn't need it). + if (max_seq_len_kv.has_value() && effective_cudnn_ver < 91500) { + return {error_code_t::GRAPH_NOT_SUPPORTED, + "Max sequence length for unified SDPA node cannot be set in cuDNN < 9.15.0"}; + } + + if (attention_score_modifier != nullptr) { + return {error_code_t::GRAPH_NOT_SUPPORTED, + "Unified SDPA node doesn't yet support attention score modifier"}; + } + + if (mma_core_mode != DataType_t::HALF) { + return {error_code_t::GRAPH_NOT_SUPPORTED, + "Unified SDPA node doesn't yet support a data type other than fp16"}; + } + + if ((compute_data_type != DataType_t::NOT_SET && compute_data_type != DataType_t::FLOAT) || + context.get_compute_data_type() != DataType_t::FLOAT) { + return {error_code_t::GRAPH_NOT_SUPPORTED, + "Unified SDPA node doesn't yet support compute data type other than float"}; + } + } break; + } + + return {error_code_t::OK, ""}; +} + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/slice.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/slice.h new file mode 100644 index 00000000..e40f5c53 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/slice.h @@ -0,0 +1,115 @@ +#pragma once + +namespace cudnn_frontend::graph { + +class SliceNode : public NodeCRTP { + public: + Slice_attributes attributes; + + SliceNode(Slice_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::SLICE; + } + + error_t + infer_properties_node() override final { + getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for slice node " << attributes.name + << std::endl; + + attributes.fill_from_context(context); + + auto output = attributes.outputs.at(Slice_attributes::output_names::Y); + auto output_dim = output->get_dim(); + + if (output_dim.empty()) { + for (size_t i = 0; i < 
attributes.slices.size(); ++i) { + output_dim.push_back(attributes.slices[i].second - attributes.slices[i].first); + } + output->set_dim(output_dim); + } + + auto const input = attributes.inputs.at(Slice_attributes::input_names::X); + auto const input_data_type = input->get_data_type(); + auto const output_data_type = output->get_data_type(); + if (output_data_type == DataType_t::NOT_SET) { + output->set_data_type(input_data_type); + } else { + RETURN_CUDNN_FRONTEND_ERROR_IF(output_data_type != input_data_type, + error_code_t::INVALID_VALUE, + "output and input tensor data types should match for slice operation."); + } + + auto const input_stride = input->get_stride(); + if (output->get_stride().empty()) { + // For simple slicing without changing the step, the stride remains the same + // std::vector stride_order = + // detail::generate_stride_order_preserving_format(input_stride, output_dim.size()); + output->set_stride(input_stride); + } + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_tensors_node(std::unordered_map>& tensors, + int64_t& potential_uid, + std::unordered_set const& used_uids) const override final { + // Do not make input tensor for backend. 
+ // But assign it a uid + auto const input = attributes.inputs.at(Slice_attributes::input_names::X); + if (input->has_uid() == false) { + detail::assign_uid(input.get(), potential_uid, used_uids); + } + + auto const output = attributes.outputs.at(Slice_attributes::output_names::Y); + output->set_is_virtual(false); + CHECK_CUDNN_FRONTEND_ERROR(detail::create_cudnn_tensor(output, tensors, potential_uid, used_uids)); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operations, + std::vector>&, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>&) const override final { + CUDNN_FRONTEND_UNUSED(raw_operations); + // No corresponding backend operation + + auto const virutal_output = attributes.outputs.at(Slice_attributes::output_names::Y); + if (virutal_output && virutal_output->get_is_virtual() == false) { + uids_involved_in_operations.insert(virutal_output->get_uid()); + if (auto ragged_offset = virutal_output->get_ragged_offset()) { + uids_involved_in_operations.insert(ragged_offset->get_uid()); + } + } + + return {error_code_t::OK, ""}; + } + + error_t + collect_variant_pack_replacements_node( + std::unordered_map>& + variant_pack_replacements) const override final { + auto const input = attributes.inputs.at(Slice_attributes::input_names::X); + auto const output = attributes.outputs.at(Slice_attributes::output_names::Y); + + variant_pack_replacements[input->get_uid()] = {output->get_uid(), attributes.get_offset()}; + + return {error_code_t::OK, ""}; + }; + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + j.update(R"( {"tag": "SLICE"})"_json); + } +#endif +}; + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node/softmax.h b/third_party/cudnn-frontend/include/cudnn_frontend/node/softmax.h new file mode 100644 index 00000000..2263b490 
--- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node/softmax.h @@ -0,0 +1,171 @@ +#pragma once + +#include "../../cudnn_frontend_Heuristics.h" +#include "../../cudnn_frontend_Logging.h" + +#include "../graph_helpers.h" +#include "../node_interface.h" + +#include "pointwise.h" +#include "reduction.h" + +namespace cudnn_frontend::graph { + +class SoftmaxNode : public NodeCRTP { + public: + Softmax_attributes attributes; + + SoftmaxNode(Softmax_attributes&& attributes_, detail::Context const& context) + : NodeCRTP(context), attributes(std::move(attributes_)) {} + + Type + getType() override final { + return Type::COMPOSITE; + } + + error_t + pre_validate_node() const override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Validating SoftmaxNode " << attributes.name); + + return {error_code_t::OK, ""}; + } + + error_t + infer_properties_node() override final { + return {error_code_t::OK, ""}; + } + + error_t + expand_node() override final { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Inferrencing properties for Softmax node " << attributes.name); + + attributes.fill_from_context(context); + + // Fill properties of virtual tensors + auto const p_dim = attributes.inputs[Softmax_attributes::input_names::P]->get_dim(); + auto b = p_dim[0]; + auto h = p_dim[1]; + auto s_q = p_dim[2]; + + auto max_output = attributes.outputs[Softmax_attributes::output_names::Max]; + if (max_output == nullptr) { + max_output = std::make_shared(); + max_output->set_is_virtual(true); + } + //////////////// TODO ////////////////////////// + // Check Stride (Before setting dimension?) 
+ if (max_output->get_dim().empty()) { + max_output->set_dim({b, h, s_q, 1}); + } + if (max_output->get_stride().empty()) { + max_output->set_stride({h * s_q, s_q, 1, 1}); + } + + auto max_attributes = Reduction_attributes().set_name("Max").set_mode(ReductionMode_t::MAX); + // If sink tensor is present, we also need to take a pointwise max with sink + if (attributes.inputs.find(Softmax_attributes::input_names::SINK) != attributes.inputs.end()) { + auto s_max = reduction(attributes.inputs[Softmax_attributes::input_names::P], max_attributes); + s_max->set_name("s_max"); + + auto sink_tensor = attributes.inputs[Softmax_attributes::input_names::SINK]; + auto sink_attributes = Pointwise_attributes().set_name("max_sink").set_mode(PointwiseMode_t::MAX); + pointwise(s_max, sink_tensor, sink_attributes, max_output); + } else { + // Special non-functional-style call. Needed because output already created and provided to user. + reduction(attributes.inputs[Softmax_attributes::input_names::P], max_attributes, max_output); + } + + auto sub_attributes = Pointwise_attributes().set_name("sub").set_mode(PointwiseMode_t::SUB); + auto const& sub_output = + pointwise(attributes.inputs[Softmax_attributes::input_names::P], max_output, sub_attributes); + sub_output->set_name("sub_M"); + + auto exp_attributes = Pointwise_attributes().set_name("exp").set_mode(PointwiseMode_t::EXP); + auto const& exp_output = pointwise(sub_output, exp_attributes); + exp_output->set_name("exp_sub_M"); + + auto sum_output = attributes.outputs[Softmax_attributes::output_names::Sum_exp]; + if (sum_output == nullptr) { + sum_output = std::make_shared(); + sum_output->set_is_virtual(true); + } + sum_output->set_name("SumExp"); + if (sum_output->get_dim().empty()) { + sum_output->set_dim({b, h, s_q, 1}); + } + if (sum_output->get_stride().empty()) { + sum_output->set_stride({h * s_q, s_q, 1, 1}); + } + auto sum_attributes = Reduction_attributes().set_name("sum").set_mode(ReductionMode_t::ADD); + // If sink tensor 
is present, also subtract it and take its exp + if (attributes.inputs.find(Softmax_attributes::input_names::SINK) != attributes.inputs.end()) { + auto sink_tensor = attributes.inputs[Softmax_attributes::input_names::SINK]; + auto sub_sink = pointwise(sink_tensor, max_output, sub_attributes); + sub_sink->set_name("sub_sink"); + + auto exp_sink = pointwise(sub_sink, exp_attributes); + exp_sink->set_name("exp_sink"); + + auto temp_sum = reduction(exp_output, sum_attributes); + temp_sum->set_name("SumExp_elements").set_dim({b, h, s_q, 1}).set_stride({h * s_q, s_q, 1, 1}); + + auto add_attributes = Pointwise_attributes().set_name("add_sink").set_mode(PointwiseMode_t::ADD); + pointwise(temp_sum, exp_sink, add_attributes, sum_output); + } else { + reduction(exp_output, sum_attributes, sum_output); + } + + // WAR when: + // - softmax stats in not requested + // - max and sum_exp are not requested + if (attributes.outputs[Softmax_attributes::output_names::Stats] == nullptr && + attributes.outputs[Softmax_attributes::output_names::Max] == nullptr && + attributes.outputs[Softmax_attributes::output_names::Sum_exp] == nullptr) { + auto softmax_stats = std::make_shared(); + softmax_stats->set_is_virtual(true); + attributes.outputs[Softmax_attributes::output_names::Stats] = softmax_stats; + } + + if (attributes.outputs.find(Softmax_attributes::output_names::Stats) != attributes.outputs.end() && + attributes.outputs[Softmax_attributes::output_names::Stats] != nullptr) { + auto log_attributes = Pointwise_attributes().set_name("log").set_mode(PointwiseMode_t::LOG); + auto const& log_output = pointwise(sum_output, log_attributes); + log_output->set_dim({b, h, s_q, 1}).set_stride({h * s_q, s_q, 1, 1}); + + auto add_attributes = Pointwise_attributes().set_name("add").set_mode(PointwiseMode_t::ADD); + // Special non-functional-style call. Needed because output already created and provided to user. 
+ pointwise( + max_output, log_output, add_attributes, attributes.outputs[Softmax_attributes::output_names::Stats]); + } + + auto div_attributes = Pointwise_attributes().set_name("div").set_mode(PointwiseMode_t::DIV); + // Special non-functional-style call. Needed because output already created and provided to user. + pointwise(exp_output, sum_output, div_attributes, attributes.outputs[Softmax_attributes::output_names::S]); + + return {error_code_t::OK, ""}; + } + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const override final { + j = attributes; + } +#endif +}; + +inline void +INode::softmax(std::shared_ptr p, + Softmax_attributes attributes, + std::shared_ptr s, + std::shared_ptr stats, + std::shared_ptr max, + std::shared_ptr sum_exp) { + attributes.inputs[Softmax_attributes::input_names::P] = p; + attributes.outputs[Softmax_attributes::output_names::S] = s; + attributes.outputs[Softmax_attributes::output_names::Stats] = stats; + attributes.outputs[Softmax_attributes::output_names::Max] = max; + attributes.outputs[Softmax_attributes::output_names::Sum_exp] = sum_exp; + sub_nodes.emplace_back(std::make_unique(std::move(attributes), context)); +} + +} // namespace cudnn_frontend::graph \ No newline at end of file diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/node_interface.h b/third_party/cudnn-frontend/include/cudnn_frontend/node_interface.h new file mode 100644 index 00000000..7019d12c --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/node_interface.h @@ -0,0 +1,487 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include "../cudnn_frontend_Tensor.h" +#include "../cudnn_frontend_Operation.h" +#include "../cudnn_frontend_OperationGraph.h" +#include "../cudnn_frontend_ExecutionPlan.h" +#include "../cudnn_frontend_VariantPack.h" +#include "../cudnn_frontend_shim.h" + +#include "cudnn_interface.h" + +#include "graph_properties.h" + +namespace 
cudnn_frontend { + +namespace graph { + +class BatchNormNode; +class DBNNode; +class ConcatenateNode; +class MatmulNode; +class MatmulFP8Node; +class PointwiseNode; +class ReductionNode; +class ResampleNode; +class ReshapeNode; +class RngNode; +class SoftmaxNode; +class MoeGroupedMatmulNode; + +// Interface for all nodes to follow. +class INode { + public: + // A closed set of types that are allowed to be passed by value today + using pass_by_values_t = Tensor_attributes::pass_by_values_t; + + detail::Context context; + + protected: + // Will eventually be moved to Graph class + std::unordered_set> full_graph_outputs; + std::shared_ptr + output_tensor(std::string const& name) { + auto tensor = std::make_shared(); + tensor->set_name(name).set_is_virtual(true); + full_graph_outputs.insert(tensor); + return tensor; + } + + private: + virtual error_t + pre_validate_node() const { + return {error_code_t::OK, ""}; + }; + + virtual error_t + infer_properties_node() = 0; + + virtual error_t + expand_node() { + return {error_code_t::OK, ""}; + }; + + virtual error_t + post_validate_node() const { + return {error_code_t::OK, ""}; + }; + + virtual int64_t + get_fe_workspace_size_node() const { + return 0; + } + + virtual error_t + collect_pass_by_value_tensors_node(std::unordered_map&) const { + return {error_code_t::OK, ""}; + }; + + virtual error_t + collect_variant_pack_replacements_node( + std::unordered_map>&) const { + return {error_code_t::OK, ""}; + }; + + virtual error_t + create_cudnn_tensors_node( + std::unordered_map>& uid_to_backend_tensors, + int64_t& potential_uid, + std::unordered_set const& used_uids) const = 0; + + virtual error_t + collect_tensors_in_workspace_node( + std::unordered_map>>&, + int64_t&) const { + return {error_code_t::OK, ""}; + } + + protected: + // Type of each node. Nodes can either be a composite (value COMPOSITE) or + // one of the other primitive types. Primitives types are nothing but + // cudnn operations. 
+ enum class Type { + COMPOSITE, + BATCHNORM, + BATCHNORM_INFERENCE, + BN_FINALIZE, + CONVOLUTION, + DBN, + DBN_WEIGHT, + DLN, + DIN, + DGRAD, + DRMSNorm, + GENSTATS, + LAYERNORM, + INSTANCENORM, + MATMUL, + POINTWISE, + REDUCTION, + RESAMPLE, + RESHAPE, + RMSNORM, + RNG, + SLICE, + WGRAD, + PAGED_CACHE_LOAD, + BLOCK_SCALE_QUANTIZE, + BLOCK_SCALE_DEQUANTIZE, + CONCATENATE, + ADALAYERNORM, + DADALAYERNORM, + UNIFIED_SDPA, + MOE_GROUPED_MATMUL, + }; + Type tag; + + inline void + matmul(std::shared_ptr a, + std::shared_ptr b, + Matmul_attributes attributes, + std::shared_ptr c); + + void + matmul_fp8(std::shared_ptr a, + std::shared_ptr b, + std::shared_ptr descale_a, + std::shared_ptr descale_b, + std::shared_ptr scale_c, + Matmul_fp8_attributes attributes, + std::shared_ptr c, + std::shared_ptr amax_c); + + void + softmax(std::shared_ptr p, + Softmax_attributes attributes, + std::shared_ptr s, + std::shared_ptr stats, + std::shared_ptr max, + std::shared_ptr sum_exp); + + void + softmax(std::shared_ptr p, + Softmax_attributes attributes, + std::shared_ptr s, + std::shared_ptr m, + std::shared_ptr zinv); + + void + pointwise(std::shared_ptr a, + Pointwise_attributes attributes, + std::shared_ptr c); + + void + pointwise(std::shared_ptr a, + std::shared_ptr b, + Pointwise_attributes attributes, + std::shared_ptr c); + + void + reduction(std::shared_ptr a, + Reduction_attributes attributes, + std::shared_ptr c); + + void + rng(std::shared_ptr seed, + std::shared_ptr offset, + Rng_attributes attributes, + std::shared_ptr y); + + void + paged_cache_load(std::shared_ptr container, + std::shared_ptr seqLen, + std::shared_ptr pageTable, + PagedCacheLoad_attributes attributes, + std::shared_ptr yOut); + + void + block_scale_quantize(std::shared_ptr x, + Block_scale_quantize_attributes attributes, + std::shared_ptr y, + std::shared_ptr scale); + + void + block_scale_dequantize(std::shared_ptr x, + std::shared_ptr scale, + Block_scale_dequantize_attributes attributes, + 
std::shared_ptr y); + + void + concatenate(std::vector> x, + Concatenate_attributes attributes, + std::shared_ptr y); + + void + moe_grouped_matmul(std::shared_ptr token, + std::shared_ptr weight, + std::shared_ptr first_token_offset, + std::shared_ptr token_index, + std::shared_ptr token_ks, + Moe_grouped_matmul_attributes attributes, + std::shared_ptr output); + + error_t + validate_subtree() { + // pre validate to catch errors early + // Otherwise code reability decreases in expand_and_infer + CHECK_CUDNN_FRONTEND_ERROR(pre_validate_node()); + CHECK_CUDNN_FRONTEND_ERROR(infer_properties_node()); + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR(sub_node->validate_subtree()); + } + CHECK_CUDNN_FRONTEND_ERROR(post_validate_node()); + return {error_code_t::OK, ""}; + } + + error_t + expand_subtree() { + // Validate self + CHECK_CUDNN_FRONTEND_ERROR(pre_validate_node()); + CHECK_CUDNN_FRONTEND_ERROR(infer_properties_node()); + CHECK_CUDNN_FRONTEND_ERROR(expand_node()); + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR(sub_node->expand_subtree()); + } + CHECK_CUDNN_FRONTEND_ERROR(post_validate_node()); + return {error_code_t::OK, ""}; + } + + // Creates cudnn tensors for each node (and its sub nodes) + error_t + create_cudnn_tensors_subtree( + std::unordered_map>& uid_to_backend_tensors, + int64_t& potential_uid, + std::unordered_set const& used_uids) const { + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors_node(uid_to_backend_tensors, potential_uid, used_uids)); + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR( + sub_node->create_cudnn_tensors_subtree(uid_to_backend_tensors, potential_uid, used_uids)); + } + return {error_code_t::OK, ""}; + } + + error_t + collect_pass_by_value_tensors_subtree( + std::unordered_map& tensor_to_pass_by_value) const { + CHECK_CUDNN_FRONTEND_ERROR(collect_pass_by_value_tensors_node(tensor_to_pass_by_value)); + for (auto const& sub_node : sub_nodes) { + 
CHECK_CUDNN_FRONTEND_ERROR(sub_node->collect_pass_by_value_tensors_subtree(tensor_to_pass_by_value)); + } + return {error_code_t::OK, ""}; + } + + error_t + collect_tensors_in_workspace_subtree( + std::unordered_map>>& + worskspace_modifications, + int64_t& offset) const { + CHECK_CUDNN_FRONTEND_ERROR(collect_tensors_in_workspace_node(worskspace_modifications, offset)); + offset = get_fe_workspace_size_node(); + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR( + sub_node->collect_tensors_in_workspace_subtree(worskspace_modifications, offset)); + offset += sub_node->get_fe_workspace_size_node(); + } + return {error_code_t::OK, ""}; + } + + error_t + collect_variant_pack_replacements_subtree( + std::unordered_map>& replacements) + const { + CHECK_CUDNN_FRONTEND_ERROR(collect_variant_pack_replacements_node(replacements)); + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR(sub_node->collect_variant_pack_replacements_subtree(replacements)); + } + return {error_code_t::OK, ""}; + } + + int64_t + get_fe_workspace_size_subtree() const { + int64_t fe_workspace_size = get_fe_workspace_size_node(); + for (auto const& sub_node : sub_nodes) { + fe_workspace_size += sub_node->get_fe_workspace_size_subtree(); + } + return fe_workspace_size; + } + + // Creates cudnn operation for each node (and its sub nodes) + // Only INode that map to a primitive cudnn operation need to specialize. + virtual error_t + create_cudnn_operations( + std::unordered_set& uids_involved_in_operation, + std::vector>& backend_operations, + managed_backend_descriptor_t& raw_operations, + std::unordered_map>& uid_to_backend_tensors) const { + for (auto const& sub_node : sub_nodes) { + CHECK_CUDNN_FRONTEND_ERROR(sub_node->create_cudnn_operations( + uids_involved_in_operation, backend_operations, raw_operations, uid_to_backend_tensors)); + } + return {error_code_t::OK, ""}; + } + + // An implicitly topological-sorted vector of sub nodes. 
+ // The sorted order is a side effect of functional API. + std::vector> sub_nodes; + + public: + virtual Type + getType() = 0; + + virtual std::pair> + override_heuristics_query() const { + return {-1, {}}; + } + + std::shared_ptr matmul(std::shared_ptr, + std::shared_ptr, + Matmul_attributes); + + std::shared_ptr pointwise(std::shared_ptr, Pointwise_attributes); + std::shared_ptr pointwise(std::shared_ptr, + std::shared_ptr, + Pointwise_attributes); + std::shared_ptr pointwise(std::shared_ptr, + std::shared_ptr, + std::shared_ptr, + Pointwise_attributes); + + std::shared_ptr reduction(std::shared_ptr, Reduction_attributes); + std::array, 2> resample(std::shared_ptr, Resample_attributes); + std::shared_ptr reshape(std::shared_ptr, Reshape_attributes); + + std::shared_ptr rng(std::shared_ptr, + std::shared_ptr, + Rng_attributes); + + INode(detail::Context const& context) : context(context) {} + + // Make sure each node implements a public serialize function +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + virtual void + serialize(json& j) const = 0; +#endif + + virtual size_t + key() { +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB + json j; + serialize(j); + return std::hash{}(j); +#else + return 1; +#endif + } + + virtual ~INode() = default; +}; + +#ifndef CUDNN_FRONTEND_SKIP_JSON_LIB +[[maybe_unused]] static void +to_json(json& j, const INode& p) { + p.serialize(j); +} +#endif + +template +class NodeCRTP : public INode { + DerivedT& + self() { + return *static_cast(this); + } + DerivedT const& + self() const { + return *static_cast(this); + } + + error_t + collect_pass_by_value_tensors_node( + std::unordered_map& tensor_to_pass_by_value) const override final { + CHECK_CUDNN_FRONTEND_ERROR(self().attributes.fill_pass_by_value(tensor_to_pass_by_value)); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_tensors_node(std::unordered_map>& tensors, + int64_t& potential_uid, + std::unordered_set const& used_uids) const override { + CUDNN_FE_LOG_LABEL_ENDL("INFO: Creating 
cudnn tensors for node named '" << self().attributes.name << "':"); + + if constexpr (std::is_same_v) { + for (auto const& tensor : self().attributes.inputs) { + if (tensor) { + CHECK_CUDNN_FRONTEND_ERROR(detail::create_cudnn_tensor(tensor, tensors, potential_uid, used_uids)); + } + } + } else { + for (auto const& [name, tensor] : self().attributes.inputs) { + (void)name; + if (tensor) { + CHECK_CUDNN_FRONTEND_ERROR(detail::create_cudnn_tensor(tensor, tensors, potential_uid, used_uids)); + } + } + } + + for (auto const& [name, tensor] : self().attributes.outputs) { + (void)name; + if (tensor) { + CHECK_CUDNN_FRONTEND_ERROR(detail::create_cudnn_tensor(tensor, tensors, potential_uid, used_uids)); + } + } + + // Handle special case of BN where peer_stats is also an input + if constexpr (std::is_same_v || std::is_same_v) { + // Special case in BN where peer stats is also an input but is not present in inputs map + for (auto const& tensor : self().attributes.peer_stats) { + if (tensor) { + CHECK_CUDNN_FRONTEND_ERROR(detail::create_cudnn_tensor(tensor, tensors, potential_uid, used_uids)); + } + } + } + + return {error_code_t::OK, ""}; + } + + protected: + using INode::INode; +}; + +#define CUDNN_FE_VALIDATE_TENSOR_(port, map_) \ + { \ + auto t = map_.find(port); \ + bool const has_t = (t != map_.end()) && (t->second != nullptr); \ + RETURN_CUDNN_FRONTEND_ERROR_IF( \ + !has_t, error_code_t::ATTRIBUTE_NOT_SET, std::string("Tensor ") + #port + " not set"); \ + } + +#define CUDNN_FE_VALIDATE_AND_ASSIGN_TENSOR_(tensor, port, map_) \ + auto tensor = map_.find(port); \ + { \ + bool const has_t = (tensor != map_.end()) && (tensor->second != nullptr); \ + RETURN_CUDNN_FRONTEND_ERROR_IF( \ + !has_t, error_code_t::ATTRIBUTE_NOT_SET, std::string("Tensor ") + #port + " not set"); \ + } + +#define CUDNN_FE_VALIDATE_INPUT_TENSOR(port) CUDNN_FE_VALIDATE_TENSOR_(port, attributes.inputs) + +#define CUDNN_FE_VALIDATE_OUTPUT_TENSOR(port) CUDNN_FE_VALIDATE_TENSOR_(port, attributes.outputs) + 
+#define CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(tensor, port) \ + CUDNN_FE_VALIDATE_AND_ASSIGN_TENSOR_(tensor, port, attributes.inputs) + +#define CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(tensor, port) \ + CUDNN_FE_VALIDATE_AND_ASSIGN_TENSOR_(tensor, port, attributes.outputs) + +} // namespace graph + +} // namespace cudnn_frontend diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/plans.h b/third_party/cudnn-frontend/include/cudnn_frontend/plans.h new file mode 100644 index 00000000..c30812f2 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/plans.h @@ -0,0 +1,694 @@ +#pragma once + +#include +#include +#include + +#include "../cudnn_frontend_EngineConfig.h" +#include "../cudnn_frontend_Logging.h" +#include "graph_helpers.h" + +#include "backend/execution_helpers.h" +#include "backend/plan_helpers.h" + +namespace cudnn_frontend { + +namespace detail { + +inline error_t +execute(cudnnHandle_t handle, + ExecutionPlan* plan, + std::vector& device_ptrs, + std::vector const& uids, + void* workspace_ptr, + std::vector const& override_uids, + std::vector> const& override_shapes, + std::vector> const& override_strides) { + // TODO: below line fails with MSVC. 
warning C4127: conditional expression is constant + // RETURN_CUDNN_FRONTEND_ERROR_IF(!plan, error_code_t::GRAPH_EXECUTION_FAILED, "No plan found to execute!!"); + CUDNN_FE_LOG_LABEL_ENDL("INFO: Executing " << plan->getTag() << "..."); + + backend_descriptor variant_pack_descriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(variant_pack_descriptor.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create variant pack's backend descriptor."); + + CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack( + variant_pack_descriptor, device_ptrs, uids, workspace_ptr, override_uids, override_shapes, override_strides)); + _CUDNN_CHECK_CUDNN_ERROR(execute(handle, plan->get_raw_desc(), variant_pack_descriptor.get_ptr())); + + CUDNN_FE_LOG_LABEL_ENDL("INFO: Executed " << plan->getTag() << "."); + + return {error_code_t::OK, ""}; +} + +inline error_t +execute(cudnnHandle_t handle, + ExecutionPlan* plan, + std::vector& device_ptrs, + std::vector const& uids, + void* workspace_ptr) { + // TODO: below line fails with MSVC. 
warning C4127: conditional expression is constant + // RETURN_CUDNN_FRONTEND_ERROR_IF(!plan, error_code_t::GRAPH_EXECUTION_FAILED, "No plan found to execute!!"); + CUDNN_FE_LOG_LABEL_ENDL("INFO: Executing " << plan->getTag() << "..."); + + backend_descriptor variant_pack_descriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR); + RETURN_CUDNN_FRONTEND_ERROR_IF(variant_pack_descriptor.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::CUDNN_BACKEND_API_FAILED, + "Failed to create variant pack's backend descriptor."); + + CHECK_CUDNN_FRONTEND_ERROR(create_variant_pack(variant_pack_descriptor, device_ptrs, uids, workspace_ptr)); + _CUDNN_CHECK_CUDNN_ERROR(execute(handle, plan->get_raw_desc(), variant_pack_descriptor.get_ptr())); + + CUDNN_FE_LOG_LABEL_ENDL("INFO: Executed " << plan->getTag() << "."); + + return {error_code_t::OK, ""}; +} + +inline error_t +query_cudnn_heuristics_impl(std::shared_ptr const& operation_graph, + cudnn_frontend::EngineConfigList& configs, + std::vector const& modes, + int32_t sm_count, + std::shared_ptr device_properties = nullptr) { + RETURN_CUDNN_FRONTEND_ERROR_IF( + operation_graph == nullptr, + error_code_t::HEURISTIC_QUERY_FAILED, + "Empty operation graph provided. Did you forget to call graph.build_operation_graph()?"); + + auto const& operation_graph_tag = operation_graph->getTag(); + CUDNN_FE_LOG_LABEL_ENDL("INFO: " << " Getting plan from heuristics for " << operation_graph_tag << " ..."); + + std::vector statuses; +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ statuses = cudnn_frontend::get_heuristics_list( + modes, *operation_graph, allowAllConfig, configs, true, sm_count, device_properties); +#else + // build() can throw + // wrap in try catch + try { + statuses = cudnn_frontend::get_heuristics_list( + modes, *operation_graph, allowAllConfig, configs, true, sm_count, device_properties); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::HEURISTIC_QUERY_FAILED, e.what()); + CUDNN_FE_LOG_LABEL("ERROR: " << e.what() << ". "); + CUDNN_FE_LOG(error_code_t::HEURISTIC_QUERY_FAILED << " because querying heuristics failed at " << __FILE__ + << ":" << __LINE__ << "\n"); + return {error_code_t::HEURISTIC_QUERY_FAILED, e.what()}; + } +#endif + + CUDNN_FE_LOG_LABEL("INFO: get_heuristics_list statuses: "); + for (size_t i = 0; i < statuses.size(); i++) { + CUDNN_FE_LOG(cudnn_frontend::to_string(statuses[i]) << " "); + } + CUDNN_FE_LOG(std::endl); + + CUDNN_FE_LOG_LABEL_ENDL("INFO: config list has " << configs.size() << " configurations."); + + if (configs.empty()) { + std::string err_msg = detail::get_last_error_string_(); + CUDNN_FE_LOG_LABEL_ENDL("ERROR: No valid engine configs returned from heuristics.\n" << err_msg); + return {error_code_t::HEURISTIC_QUERY_FAILED, + "No valid engine configs for " + operation_graph_tag + "\n" + err_msg}; + } + return {error_code_t::OK, ""}; +} + +inline error_t +create_cudnn_execution_plan(std::shared_ptr& plan, + std::string const& serialized_data, + cudnnHandle_t handle) { + auto&& plan_builder = cudnn_frontend::ExecutionPlanBuilder(); + + plan_builder.setHandle(handle); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto built_plan = plan_builder.loadFromJson(serialized_data); + RETURN_CUDNN_FRONTEND_ERROR_IF(built_plan.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + built_plan.get_error()); + plan = std::make_shared(std::move(built_plan)); +#else + // build() can throw + // wrap in try catch + try { + auto built_plan = plan_builder.loadFromJson(serialized_data); + plan = std::make_shared(std::move(built_plan)); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + // e.what()); + CUDNN_FE_LOG_LABEL(" ERROR: " << e.what() << ". "); + CUDNN_FE_LOG(error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED << " because plan building failed at " + << __FILE__ << ":" << __LINE__ << "\n"); + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, e.what()}; + } +#endif + + return {error_code_t::OK, ""}; +} + +inline error_t +create_cudnn_execution_plan(std::shared_ptr& plan, + ManagedOpaqueDescriptor const& config, + std::string const& operation_graph_tag, + std::shared_ptr kernel_cache) { + auto&& plan_builder = cudnn_frontend::ExecutionPlanBuilder(); + + plan_builder.setEngineConfig(config, operation_graph_tag).setKernelCache(kernel_cache); + +#ifdef NV_CUDNN_DISABLE_EXCEPTION + // disable exception macro is defined. Calling build will not throw. + // Check status of desc and return error. 
+ auto built_plan = plan_builder.build(); + RETURN_CUDNN_FRONTEND_ERROR_IF(built_plan.get_status() != CUDNN_STATUS_SUCCESS, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + built_plan.get_error()); + plan = std::make_shared(std::move(built_plan)); +#else + // build() can throw + // wrap in try catch + try { + auto built_plan = plan_builder.build(); + plan = std::make_shared(std::move(built_plan)); + } catch (cudnn_frontend::cudnnException& e) { + // Silly MSVC error that thinks below condition is constexpr + // RETURN_CUDNN_FRONTEND_ERROR_IF( + // e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + // e.what()); + CUDNN_FE_LOG_LABEL("ERROR: " << e.what() << ". "); + CUDNN_FE_LOG(error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED << " because plan building failed at " + << __FILE__ << ":" << __LINE__ << "\n"); + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, e.what()}; + } +#endif + + return {error_code_t::OK, ""}; +} + +} // namespace detail + +namespace graph { +class Execution_plan_list { + std::string operation_tag; + + std::vector barred_indices; + std::shared_ptr kernel_cache = nullptr; + + int64_t max_workspace_allowed = std::numeric_limits::max(); + int64_t max_shared_mem_allowed = 1024 * 1024 * 1024; // Crazy high number (2GB) which will never be hit + + std::vector barred_engine_names = {}; + EngineConfigList engine_configs; + + error_t + _build_plan_at_index_impl(int64_t index) { + if (execution_plans[index] == nullptr) { + CHECK_CUDNN_FRONTEND_ERROR(detail::create_cudnn_execution_plan( + execution_plans[index], engine_configs[index], operation_tag, kernel_cache)); + } + + auto is_blocked = [](std::string const& full_name, std::vector const& blocked_names) -> bool { + for (auto const& blocked_name : blocked_names) { + if (full_name.find(blocked_name) != std::string::npos) { + return true; + } + } + return false; + }; + auto const& plan_tag = execution_plans[index]->getTag(); + if 
(is_blocked(plan_tag, barred_engine_names)) { + barred_indices[index] = true; + + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: Deselecting execution plan with name " + plan_tag + " at position " + + std::to_string(index)}; + } + + // workspace check for 9.2+ is already done at engine config level + if (detail::get_backend_version() < 90200 || detail::get_compiled_version() < 90200) { + if (execution_plans[index]->getWorkspaceSize() > max_workspace_allowed) { + barred_indices[index] = true; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: Workspace size is too large."}; + } + } + + // Sets candidate in case user does not call execute with plan_index later. + candidate = index; + + return {error_code_t::OK, ""}; + } + + public: + std::vector> numeric_notes; + std::vector> behavior_notes; + + std::vector> + execution_plans; // a built plan corresponding to each engine config, irrespective of whether config is + // selected or deselected. 
+ + // Stores position of best plan in above vector of execution plan + int64_t candidate = -1; + + void + set_tag(std::string const& tag) { + operation_tag = tag; + } + void + enqueue_engine_configs(EngineConfigList list) { + std::move(list.begin(), list.end(), back_inserter(engine_configs)); + } + void + set_kernel_cache(std::shared_ptr kernel_cache_) { + kernel_cache = kernel_cache_; + } + + std::vector>& + get_execution_plans() { + return execution_plans; + } + + error_t + query_properties() { + numeric_notes.reserve(engine_configs.size()); + behavior_notes.reserve(engine_configs.size()); + + barred_indices.resize(engine_configs.size(), 0); + execution_plans.resize(engine_configs.size()); + + for (auto& engine_config : engine_configs) { + int64_t elem_count = 0; + std::vector numeric; + std::vector behavior; + + ManagedOpaqueDescriptor extractedEngine = make_shared_backend_pointer(CUDNN_BACKEND_ENGINE_DESCRIPTOR); + cudnnBackendDescriptor_t extractedEngine_ = extractedEngine->get_backend_descriptor(); + auto status = detail::get_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_ENGINE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &elem_count, + &extractedEngine_); + RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS), + error_code_t::HEURISTIC_QUERY_FAILED, + "Heuristic query Engine failed."); + + status = detail::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, + &elem_count, + nullptr); + RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS), + error_code_t::HEURISTIC_QUERY_FAILED, + "Heuristic query Numerical Note failed"); + + numeric.resize(static_cast(elem_count)); + status = detail::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, + &elem_count, + numeric.data()); + RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS), + 
error_code_t::HEURISTIC_QUERY_FAILED, + "Heuristic query Numerical Note failed"); + status = detail::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, + &elem_count, + nullptr); + RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS), + error_code_t::HEURISTIC_QUERY_FAILED, + "Heuristic query Behavior Note failed"); + + behavior.resize(static_cast(elem_count)); + status = detail::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, + &elem_count, + behavior.data()); + RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS), + error_code_t::HEURISTIC_QUERY_FAILED, + "Heuristic query Behavior Note failed"); + + std::vector numerics; + numerics.resize(numeric.size()); + for (auto& note : numeric) { + numerics.push_back(detail::convert_from_cudnn_type(note)); + } + numeric_notes.emplace_back(std::move(numerics)); + + std::vector behaviors; + behaviors.reserve(behaviors.size()); + for (auto& note : behavior) { + behaviors.push_back(detail::convert_from_cudnn_type(note)); + } + behavior_notes.emplace_back(std::move(behaviors)); + } + return {error_code_t::OK, ""}; + } + + error_t + filter_numeric_notes(std::vector const& notes, bool const keep) { + for (auto& note : notes) { + for (auto i = 0u; i < engine_configs.size(); i++) { + bool has_barred_note = + std::find(numeric_notes[i].begin(), numeric_notes[i].end(), note) != numeric_notes[i].end(); + + barred_indices[i] = barred_indices[i] || (has_barred_note ? 
!keep : keep); + } + } + return {error_code_t::OK, ""}; + } + + error_t + filter_behavior_notes(std::vector const& notes, bool const keep) { + for (auto& note : notes) { + for (auto i = 0u; i < engine_configs.size(); i++) { + bool has_barred_note = + std::find(behavior_notes[i].begin(), behavior_notes[i].end(), note) != behavior_notes[i].end(); + + barred_indices[i] = barred_indices[i] || (has_barred_note ? !keep : keep); + } + } + return {error_code_t::OK, ""}; + } + + void + set_max_workspace_allowed(int64_t const workspace_allowed) { + max_workspace_allowed = workspace_allowed; + } + + void + set_max_shared_mem_allowed(int64_t const smem_allowed) { + max_shared_mem_allowed = smem_allowed; + } + + void + set_barred_names(std::vector const& engine_names) { + barred_engine_names = engine_names; + } + + EngineConfigList + get_barred_engine_configs() { + EngineConfigList barred_engine_configs; + CUDNN_FE_LOG_LABEL_ENDL("INFO: " << " Filtering engine_configs ..." << engine_configs.size()); + for (auto i = 0u; i < engine_configs.size(); i++) { + if (barred_indices[i] == false) { + barred_engine_configs.push_back(engine_configs[i]); + } + } + CUDNN_FE_LOG_LABEL_ENDL("INFO: " << " barred engine_configs ..." << barred_engine_configs.size()); + return barred_engine_configs; + } + + error_t + get_name_at_index(int64_t index, std::string& name) const { + name = detail::get_engine_tag(engine_configs[index]); + return {error_code_t::OK, ""}; + } + + error_t + check_support_at_index(int64_t index) { + // Ignore if the engine config was deselected. + // This usually happens when user deselects by numerical and behavioural notes. 
+ + RETURN_CUDNN_FRONTEND_ERROR_IF((index < 0) || (static_cast(barred_indices.size()) <= index), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index " + std::to_string(index) + " is invalid."); + + if (barred_indices[index] == true) { + CUDNN_FE_LOG_LABEL_ENDL("Deselecting execution plan at position " << index); + } + + RETURN_CUDNN_FRONTEND_ERROR_IF(barred_indices[index] == true, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "Deselecting execution plan"); + + // Ignore if engine name was specified to be ignored by the user. + auto is_blocked = [](std::string const& full_name, std::vector const& blocked_names) -> bool { + for (auto const& blocked_name : blocked_names) { + if (full_name.find(blocked_name) != std::string::npos) { + return true; + } + } + return false; + }; + auto cfg_tag = detail::get_engine_tag(engine_configs[index]); + if (is_blocked(cfg_tag, barred_engine_names)) { + barred_indices[index] = true; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: Deselecting execution plan with name " + cfg_tag + " at position " + + std::to_string(index)}; + } + + if (detail::get_backend_version() >= 90200 && detail::get_compiled_version() >= 90200) { + // Ignore kernels that require larger than tolerable shared memory. + int32_t shared_memory_size = INT32_MAX; + auto status = detail::get_shared_memory_size(engine_configs[index], shared_memory_size); + if (status.is_bad()) { + CUDNN_FE_LOG_LABEL_ENDL("WARN: Unknown Shared memory size, so not deselecting plan at position " + << index); + } else if (shared_memory_size > max_shared_mem_allowed) { + barred_indices[index] = true; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: Skipping plan since shared memory violation. Requires " + + std::to_string(shared_memory_size)}; + } + + // Filter by workspace can happen at this engine config stage itself. 
+ int64_t workspace_size = INT64_MAX; + CHECK_CUDNN_FRONTEND_ERROR(detail::get_workspace_size(engine_configs[index], workspace_size)); + if (workspace_size > max_workspace_allowed) { + barred_indices[index] = true; + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: Skipping plan since workspace violation. Requires " + + std::to_string(workspace_size)}; + } + } + // Else we need to build the config. A successful execution plan build means that check_support succeeded. + else { + CHECK_CUDNN_FRONTEND_ERROR(_build_plan_at_index_impl(index)); + } + + CUDNN_FE_LOG_LABEL_ENDL("Check support for index " << index << " passed with cfg " << cfg_tag); + // All checks passed for this config, so return success. + return {error_code_t::OK, ""}; + } + + error_t + check_support() { + // Go over each engine config and return true when you find the first one that is supported. + for (auto i = 0u; i < engine_configs.size(); i++) { + auto status = check_support_at_index(i); + if (status.is_good()) { + return {error_code_t::OK, ""}; + } + } + + std::string err_msg = detail::get_last_error_string_(); + CUDNN_FE_LOG_LABEL_ENDL("ERROR: No valid engine configs returned from heuristics.\n" << err_msg); + return {error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: No execution plans support the graph." 
+ err_msg}; + } + + error_t + get_behavior_notes_at_index(int64_t const index, std::vector& notes) const { + RETURN_CUDNN_FRONTEND_ERROR_IF((index < 0) || (static_cast(behavior_notes.size()) <= index), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index " + std::to_string(index) + " is invalid."); + + notes = behavior_notes[index]; + + return {error_code_t::OK, ""}; + } + + error_t + build_plans(cudnnHandle_t handle, std::string const& json) { + execution_plans.resize(1); + auto const& fe_status = detail::create_cudnn_execution_plan(execution_plans[0], json, handle); + + if (fe_status.is_good()) { + candidate = 0; + } + + return fe_status; + } + + error_t + build_plan_at_index(int64_t index) { + CHECK_CUDNN_FRONTEND_ERROR(check_support_at_index(index)); + CHECK_CUDNN_FRONTEND_ERROR(_build_plan_at_index_impl(index)); + + return {error_code_t::OK, ""}; + } + + error_t + build_plans(BuildPlanPolicy_t const policy, bool const do_multithreaded_builds) { + RETURN_CUDNN_FRONTEND_ERROR_IF(do_multithreaded_builds, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "Doing multithreaded builds is not yet supported."); + + // short circuit in case a plan was already created. + // This happens as check_support for v8 builds a plan. + if (policy == BuildPlanPolicy_t::HEURISTICS_CHOICE && candidate != -1) { + return {error_code_t::OK, ""}; + } + + for (auto i = 0u; i < engine_configs.size(); i++) { + auto status = build_plan_at_index(i); + if (status.is_bad()) { + CUDNN_FE_LOG_LABEL_ENDL("WARN: Failed to build plan at " << i); + continue; + } + + // Only set the candidate the first time, as the order of iteration is from highest to lowest priority + if (candidate == -1) { + candidate = static_cast(i); + CUDNN_FE_LOG_LABEL_ENDL("INFO: Candidate set as " << i); + } + + // Return from this function as first successfully built plan is found. 
+ if (policy == BuildPlanPolicy_t::HEURISTICS_CHOICE) { + return {error_code_t::OK, ""}; + } + } + + // Return an error if no execution plans could be built + RETURN_CUDNN_FRONTEND_ERROR_IF(candidate == -1, + error_code_t::GRAPH_EXECUTION_PLAN_CREATION_FAILED, + "[cudnn_frontend] Error: No valid execution plans built."); + + return {error_code_t::OK, ""}; + } + + int64_t + get_autotune_workspace() const { + int64_t max_size = 0; + for (auto& plan : execution_plans) { + max_size = std::max(max_size, plan->getWorkspaceSize()); + } + return max_size; + } + + static error_t + autotune_default_impl(std::vector>& execution_plans, + cudnnHandle_t handle, + std::unordered_map const& tensor_to_pointer_map, + void* workspace_ptr, + void*) { + // Create the variant pack for all the plans to use. + std::vector uids; + std::vector ptrs; + for (auto it : tensor_to_pointer_map) { + uids.push_back(it.first); + ptrs.push_back(it.second); + } + + std::vector> time_sorted_plans; + + auto plan_cmp = [](std::shared_ptr a, std::shared_ptr b) { + return a->getExecutionTime() < b->getExecutionTime(); + }; + + std::multiset, decltype(plan_cmp)> timed_execution_plans(plan_cmp); + + const int maxIterCount = 100; + const float threshhold = 0.95f; + uint64_t successful_plan_count = 0; + cudaEvent_t start, stop; + detail::cuda_event_create(&start); + detail::cuda_event_create(&stop); + detail::cuda_device_synchronize(); + + cudaStream_t stream = nullptr; + detail::get_stream(handle, &stream); + + for (auto plan : execution_plans) { + float time_ms = 0.0f; + float final_time_ms = 0.0f; + float min_time_ms = std::numeric_limits::max(); + + // Warm-up run + CHECK_CUDNN_FRONTEND_ERROR(detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr)); + successful_plan_count++; + detail::cuda_device_synchronize(); + + for (int i = 0; i < maxIterCount; i++) { + detail::cuda_event_record(start, stream); + + auto status = detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr); + + 
detail::cuda_event_record(stop, stream); + detail::cuda_event_synchronize(stop); + detail::cuda_event_elapsed_time(&time_ms, start, stop); + + final_time_ms = std::min(min_time_ms, time_ms); + if (time_ms / min_time_ms < threshhold) { + min_time_ms = final_time_ms; + } else { + break; + } + } + + CUDNN_FE_LOG_LABEL_ENDL("Plan " << plan->getTag() << " took " << std::setw(10) << final_time_ms); + plan->setExecutionTime(final_time_ms); + timed_execution_plans.insert(plan); + } + + execution_plans.clear(); + for (auto sorted_plan : timed_execution_plans) { + execution_plans.push_back(sorted_plan); + } + + detail::cuda_event_destroy(start); + detail::cuda_event_destroy(stop); + + CUDNN_FE_LOG_LABEL_ENDL("Autotuned " << successful_plan_count << " plans."); + return {error_code_t::OK, ""}; + } + + std::function>&, + cudnnHandle_t, + std::unordered_map const&, + void*, + void*)> + autotune_impl = &Execution_plan_list::autotune_default_impl; + + error_t + autotune(cudnnHandle_t handle, + std::unordered_map const& tensor_to_pointer_map, + void* workspace, + void* user_impl = nullptr) { + auto error = autotune_impl(execution_plans, handle, tensor_to_pointer_map, workspace, user_impl); + return error; + } + + error_t + is_plan_index_executable(int64_t const index) const { + RETURN_CUDNN_FRONTEND_ERROR_IF((index < 0) || (static_cast(execution_plans.size()) <= index), + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index " + std::to_string(index) + " is invalid."); + + RETURN_CUDNN_FRONTEND_ERROR_IF(execution_plans[index] == nullptr, + error_code_t::GRAPH_EXECUTION_FAILED, + "Plan index " + std::to_string(index) + " did not build."); + + return {error_code_t::OK, ""}; + } +}; + +} // namespace graph +} // namespace cudnn_frontend diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/thirdparty/nlohmann/LICENSE.MIT b/third_party/cudnn-frontend/include/cudnn_frontend/thirdparty/nlohmann/LICENSE.MIT new file mode 100644 index 00000000..1c1f7a69 --- /dev/null +++ 
b/third_party/cudnn-frontend/include/cudnn_frontend/thirdparty/nlohmann/LICENSE.MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2013-2022 Niels Lohmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/third_party/cudnn-frontend/include/cudnn_frontend/thirdparty/nlohmann/json.hpp b/third_party/cudnn-frontend/include/cudnn_frontend/thirdparty/nlohmann/json.hpp new file mode 100644 index 00000000..85b4cdd3 --- /dev/null +++ b/third_party/cudnn-frontend/include/cudnn_frontend/thirdparty/nlohmann/json.hpp @@ -0,0 +1,26710 @@ +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +/****************************************************************************\ + * Note on documentation: The source files contain links to the online * + * documentation of the public API at https://json.nlohmann.me. This URL * + * contains the most recent documentation and should also be applicable to * + * previous versions; documentation for deprecated functions is not * + * removed, but marked deprecated. See "Generate documentation" section in * + * file docs/README.md. 
* +\****************************************************************************/ + +#ifndef INCLUDE_NLOHMANN_JSON_HPP_ +#define INCLUDE_NLOHMANN_JSON_HPP_ + +#include // all_of, find, for_each +#include // nullptr_t, ptrdiff_t, size_t +#include // hash, less +#include // initializer_list +#ifndef JSON_NO_IO +#include // istream, ostream +#endif // JSON_NO_IO +#include // random_access_iterator_tag +#include // unique_ptr +#include // string, stoi, to_string +#include // declval, forward, move, pair, swap +#include // vector + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +#include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +// This file contains all macro definitions affecting or depending on the ABI + +#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK +#if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH) +#if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3 +#warning "Already included a different version of the library!" 
+#endif +#endif +#endif + +#define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum) + +#ifndef JSON_DIAGNOSTICS +#define JSON_DIAGNOSTICS 0 +#endif + +#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON +#define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0 +#endif + +#if JSON_DIAGNOSTICS +#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag +#else +#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS +#endif + +#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON +#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp +#else +#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION +#define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0 +#endif + +// Construct the namespace ABI tags component +#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b) json_abi##a##b +#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b) NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b) + +#define NLOHMANN_JSON_ABI_TAGS \ + NLOHMANN_JSON_ABI_TAGS_CONCAT(NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS, \ + NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON) + +// Construct the namespace version component +#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) _v##major##_##minor##_##patch +#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \ + NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) + +#if NLOHMANN_JSON_NAMESPACE_NO_VERSION +#define NLOHMANN_JSON_NAMESPACE_VERSION +#else +#define NLOHMANN_JSON_NAMESPACE_VERSION \ + NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT( \ + NLOHMANN_JSON_VERSION_MAJOR, NLOHMANN_JSON_VERSION_MINOR, NLOHMANN_JSON_VERSION_PATCH) +#endif + +// Combine namespace components +#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a##b +#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) + +#ifndef NLOHMANN_JSON_NAMESPACE 
+#define NLOHMANN_JSON_NAMESPACE \ + nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT(NLOHMANN_JSON_ABI_TAGS, NLOHMANN_JSON_NAMESPACE_VERSION) +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN +#define NLOHMANN_JSON_NAMESPACE_BEGIN \ + namespace nlohmann { \ + inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT(NLOHMANN_JSON_ABI_TAGS, NLOHMANN_JSON_NAMESPACE_VERSION) { +#endif + +#ifndef NLOHMANN_JSON_NAMESPACE_END +#define NLOHMANN_JSON_NAMESPACE_END \ + } /* namespace (inline namespace) NOLINT(readability/namespace) */ \ + } // namespace nlohmann +#endif + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +#include // transform +#include // array +#include // forward_list +#include // inserter, front_inserter, end +#include // map +#include // string +#include // tuple, make_tuple +#include // is_arithmetic, is_same, is_enum, underlying_type, is_convertible +#include // unordered_map +#include // pair, declval +#include // valarray + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +#include // nullptr_t +#include // exception +#if JSON_DIAGNOSTICS +#include // accumulate +#endif +#include // runtime_error +#include // to_string +#include // vector + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +#include // array +#include // size_t +#include // uint8_t +#include // string + +// #include +// __ _____ _____ _____ +// __| 
| __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +#include // declval, pair +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +#include + +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + +// #include + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail { + +template +struct make_void { + using type = void; +}; +template +using void_t = typename make_void::type; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail { + +// https://en.cppreference.com/w/cpp/experimental/is_detected +struct nonesuch { + nonesuch() = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; + nonesuch(nonesuch const&&) = delete; + void + operator=(nonesuch const&) = delete; + void + operator=(nonesuch&&) = delete; +}; + +template class Op, class... Args> +struct detector { + using value_t = std::false_type; + using type = Default; +}; + +template class Op, class... Args> +struct detector>, Op, Args...> { + using value_t = std::true_type; + using type = Op; +}; + +template